// src/lib/extractHtmlMeta.ts

import {extractTwitterMeta} from './extractTwitterMeta'
import {extractYoutubeMeta} from './extractYoutubeMeta'

/**
 * Arguments accepted by `extractHtmlMeta`.
 */
interface ExtractHtmlMetaInput {
  /** Raw HTML text that is scanned for a `<title>` element and `<meta>` tags. */
  html: string
  /**
   * Optional hostname of the page. `extractHtmlMeta` checks it for the
   * substrings `youtube.`, `youtu.be` and `twitter.` to pick a site-specific
   * extractor.
   */
  hostname?: string
  /** Optional pathname of the page; forwarded to `extractTwitterMeta`. */
  pathname?: string
}

/**
 * Extracts link-preview metadata from a raw HTML string using regular
 * expressions (no DOM parsing).
 *
 * The result may contain the keys `title`, `description` and `image`:
 * - `title` is seeded from the first `<title>` element and then overridden by
 *   any `title` / `og:title` / `twitter:title` meta tag.
 * - `description` comes from `description` / `og:description` /
 *   `twitter:description` meta tags.
 * - `image` comes from `og:image` / `twitter:image` meta tags.
 * When several meta tags map to the same key, the last one in the document
 * wins. Values are trimmed; HTML entities are not decoded.
 *
 * @param input.html - HTML text to scan.
 * @param input.hostname - Optional hostname used to select the YouTube or
 *   Twitter specific extractor.
 * @param input.pathname - Optional pathname, forwarded to `extractTwitterMeta`.
 * @returns A record with whichever of the keys above could be found.
 */
export const extractHtmlMeta = ({
  html,
  hostname,
  pathname,
}: ExtractHtmlMetaInput): Record<string, string> => {
  // `\b[^>]*` keeps the match inside the opening tag. The previous greedy `.*`
  // could run past the first <title> and capture a later one on the same line
  // (e.g. an inline SVG <title>), and also accepted tags like <titlefoo>.
  const htmlTitleRegex = /<title\b[^>]*>([^<]+)<\/title>/i

  let res: Record<string, string> = {}

  const titleMatch = htmlTitleRegex.exec(html)
  if (titleMatch) {
    res.title = titleMatch[1].trim()
  }

  const metaRe = /<meta\s([^>]+)>/gis
  // Attribute matcher, hoisted out of the loop. The leading `(?:^|\s)` stops
  // `data-content="…"` and similar from being read as `content`; the value may
  // be wrapped in double (group 2) or single (group 3) quotes.
  const propRe =
    /(?:^|\s)(name|property|content)\s*=\s*(?:"([^"]+)"|'([^']+)')/gi

  let metaMatch: RegExpExecArray | null
  let propMatch: RegExpExecArray | null
  while ((metaMatch = metaRe.exec(html)) !== null) {
    let propName: string | undefined
    let propValue: string | undefined

    // The inner loop always runs until `exec` returns null, which resets
    // `propRe.lastIndex` to 0, so the shared regex is safe to reuse per tag.
    while ((propMatch = propRe.exec(metaMatch[1])) !== null) {
      const attrValue = propMatch[2] ?? propMatch[3]
      // The regex is case-insensitive, so normalise the attribute name before
      // comparing; otherwise `CONTENT="…"` would be mistaken for a name.
      if (propMatch[1].toLowerCase() === 'content') {
        propValue = attrValue
      } else {
        propName = attrValue
      }
    }

    if (!propName || !propValue) {
      continue
    }

    // Metadata names are matched case-insensitively (`OG:Title` === `og:title`).
    switch (propName.trim().toLowerCase()) {
      case 'title':
      case 'og:title':
      case 'twitter:title':
        res.title = propValue.trim()
        break
      case 'description':
      case 'og:description':
      case 'twitter:description':
        res.description = propValue.trim()
        break
      case 'og:image':
      case 'twitter:image':
        res.image = propValue.trim()
        break
    }
  }

  const isYoutubeUrl =
    hostname?.includes('youtube.') || hostname?.includes('youtu.be')
  const isTwitterUrl = hostname?.includes('twitter.')
  // Workaround for some websites not having a title or description in the meta tags in the initial serve
  if (isYoutubeUrl) {
    // YouTube-specific values are merged over whatever the generic scan found.
    res = {...res, ...extractYoutubeMeta(html)}
  } else if (isTwitterUrl) {
    // For Twitter the generic scan result is discarded entirely and replaced
    // by what can be derived from the pathname.
    res = {...extractTwitterMeta({pathname})}
  }

  return res
}