src/lib/strings/rich-text-detection.ts


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107

import {AppBskyFeedPost} from '@atproto/api'
type Entity = AppBskyFeedPost.Entity
import {isValidDomain} from './url-helpers'

export function extractEntities(
  text: string,
  knownHandles?: Set<string>,
): Entity[] | undefined {
  let match
  let ents: Entity[] = []
  {
    // mentions
    const re = /(^|\s|\()(@)([a-zA-Z0-9.-]+)(\b)/g
    while ((match = re.exec(text))) {
      if (knownHandles && !knownHandles.has(match[3])) {
        continue // not a known handle
      } else if (!match[3].includes('.')) {
        continue // probably not a handle
      }
      const start = text.indexOf(match[3], match.index) - 1
      ents.push({
        type: 'mention',
        value: match[3],
        index: {start, end: start + match[3].length + 1},
      })
    }
  }
  {
    // links
    const re =
      /(^|\s|\()((https?:\/\/[\S]+)|((?<domain>[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*))/gim
    while ((match = re.exec(text))) {
      let value = match[2]
      if (!value.startsWith('http')) {
        const domain = match.groups?.domain
        if (!domain || !isValidDomain(domain)) {
          continue
        }
        value = `https://${value}`
      }
      const start = text.indexOf(match[2], match.index)
      const index = {start, end: start + match[2].length}
      // strip ending puncuation
      if (/[.,;!?]$/.test(value)) {
        value = value.slice(0, -1)
        index.end--
      }
      if (/[)]$/.test(value) && !value.includes('(')) {
        value = value.slice(0, -1)
        index.end--
      }
      ents.push({
        type: 'link',
        value,
        index,
      })
    }
  }
  return ents.length > 0 ? ents : undefined
}

interface DetectedLink {
  link: string
}
type DetectedLinkable = string | DetectedLink
export function detectLinkables(text: string): DetectedLinkable[] {
  const re =
    /((^|\s|\()@[a-z0-9.-]*)|((^|\s|\()https?:\/\/[\S]+)|((^|\s|\()(?<domain>[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*)/gi
  const segments = []
  let match
  let start = 0
  while ((match = re.exec(text))) {
    let matchIndex = match.index
    let matchValue = match[0]

    if (match.groups?.domain && !isValidDomain(match.groups?.domain)) {
      continue
    }

    if (/\s|\(/.test(matchValue)) {
      // HACK
      // skip the starting space
      // we have to do this because RN doesnt support negative lookaheads
      // -prf
      matchIndex++
      matchValue = matchValue.slice(1)
    }

    // strip ending puncuation
    if (/[.,;!?]$/.test(matchValue)) {
      matchValue = matchValue.slice(0, -1)
    }
    if (/[)]$/.test(matchValue) && !matchValue.includes('(')) {
      matchValue = matchValue.slice(0, -1)
    }

    if (start !== matchIndex) {
      segments.push(text.slice(start, matchIndex))
    }
    segments.push({link: matchValue})
    start = matchIndex + matchValue.length
  }
  if (start < text.length) {
    segments.push(text.slice(start))
  }
  return segments
}