about summary refs log tree commit diff
path: root/src/lib/strings/rich-text-detection.ts
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib/strings/rich-text-detection.ts')
-rw-r--r--src/lib/strings/rich-text-detection.ts107
1 files changed, 107 insertions, 0 deletions
diff --git a/src/lib/strings/rich-text-detection.ts b/src/lib/strings/rich-text-detection.ts
new file mode 100644
index 000000000..386ed48e1
--- /dev/null
+++ b/src/lib/strings/rich-text-detection.ts
@@ -0,0 +1,107 @@
+import {AppBskyFeedPost} from '@atproto/api'
+type Entity = AppBskyFeedPost.Entity
+import {isValidDomain} from './url-helpers'
+
+export function extractEntities(
+  text: string,
+  knownHandles?: Set<string>,
+): Entity[] | undefined {
+  let match
+  let ents: Entity[] = []
+  {
+    // mentions
+    const re = /(^|\s|\()(@)([a-zA-Z0-9.-]+)(\b)/g
+    while ((match = re.exec(text))) {
+      if (knownHandles && !knownHandles.has(match[3])) {
+        continue // not a known handle
+      } else if (!match[3].includes('.')) {
+        continue // probably not a handle
+      }
+      const start = text.indexOf(match[3], match.index) - 1
+      ents.push({
+        type: 'mention',
+        value: match[3],
+        index: {start, end: start + match[3].length + 1},
+      })
+    }
+  }
+  {
+    // links
+    const re =
+      /(^|\s|\()((https?:\/\/[\S]+)|((?<domain>[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*))/gim
+    while ((match = re.exec(text))) {
+      let value = match[2]
+      if (!value.startsWith('http')) {
+        const domain = match.groups?.domain
+        if (!domain || !isValidDomain(domain)) {
+          continue
+        }
+        value = `https://${value}`
+      }
+      const start = text.indexOf(match[2], match.index)
+      const index = {start, end: start + match[2].length}
+      // strip ending puncuation
+      if (/[.,;!?]$/.test(value)) {
+        value = value.slice(0, -1)
+        index.end--
+      }
+      if (/[)]$/.test(value) && !value.includes('(')) {
+        value = value.slice(0, -1)
+        index.end--
+      }
+      ents.push({
+        type: 'link',
+        value,
+        index,
+      })
+    }
+  }
+  return ents.length > 0 ? ents : undefined
+}
+
+interface DetectedLink {
+  link: string
+}
+type DetectedLinkable = string | DetectedLink
+export function detectLinkables(text: string): DetectedLinkable[] {
+  const re =
+    /((^|\s|\()@[a-z0-9.-]*)|((^|\s|\()https?:\/\/[\S]+)|((^|\s|\()(?<domain>[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*)/gi
+  const segments = []
+  let match
+  let start = 0
+  while ((match = re.exec(text))) {
+    let matchIndex = match.index
+    let matchValue = match[0]
+
+    if (match.groups?.domain && !isValidDomain(match.groups?.domain)) {
+      continue
+    }
+
+    if (/\s|\(/.test(matchValue)) {
+      // HACK
+      // skip the starting space
+      // we have to do this because RN doesnt support negative lookaheads
+      // -prf
+      matchIndex++
+      matchValue = matchValue.slice(1)
+    }
+
+    // strip ending puncuation
+    if (/[.,;!?]$/.test(matchValue)) {
+      matchValue = matchValue.slice(0, -1)
+    }
+    if (/[)]$/.test(matchValue) && !matchValue.includes('(')) {
+      matchValue = matchValue.slice(0, -1)
+    }
+
+    if (start !== matchIndex) {
+      segments.push(text.slice(start, matchIndex))
+    }
+    segments.push({link: matchValue})
+    start = matchIndex + matchValue.length
+  }
+  if (start < text.length) {
+    segments.push(text.slice(start))
+  }
+  return segments
+}