1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
|
use std::{cell::RefCell, rc::Rc};
use microformats::{types::PropertyValue, html5ever::{self, tendril::TendrilSink}};
use kittybox_util::MentionType;
#[derive(thiserror::Error, Debug)]
pub enum Error {
#[error("microformats error: {0}")]
Microformats(#[from] microformats::Error),
// #[error("json error: {0}")]
// Json(#[from] serde_json::Error),
#[error("url parse error: {0}")]
UrlParse(#[from] url::ParseError),
}
/// Checks whether `document` (HTML with MF2 markup) mentions `link`.
///
/// Every top-level microformats item of the parsed document is inspected in
/// order:
///
/// 1. If one of its `in-reply-to`, `like-of`, `bookmark-of` or `repost-of`
///    properties holds a URL equal to `link`, the corresponding
///    [`MentionType`] is reported.
/// 2. Otherwise, if the first `content` value is an HTML fragment containing
///    an `<a href>` that resolves (against `base_url`) to `link` and is not
///    marked `rel="nofollow"`, [`MentionType::Mention`] is reported.
///
/// # Returns
///
/// `Ok(Some((kind, item)))` for the first matching item, where `item` is that
/// item serialized to JSON; `Ok(None)` when nothing in the document links to
/// `link`.
///
/// # Errors
///
/// Returns [`Error::Microformats`] when the document cannot be parsed by the
/// `microformats` crate.
///
/// # Panics
///
/// Panics if the matching item cannot be serialized into a
/// `serde_json::Value`.
#[tracing::instrument]
pub fn check_mention(document: impl AsRef<str> + std::fmt::Debug, base_url: &url::Url, link: &url::Url) -> Result<Option<(MentionType, serde_json::Value)>, Error> {
    tracing::debug!("Parsing MF2 markup...");
    // First, check the document for MF2 markup
    let document = microformats::from_html(document.as_ref(), base_url.clone())?;

    // Get an iterator of all (borrowed) top-level items
    let items_iter = document.items.iter()
        .map(AsRef::as_ref)
        .map(RefCell::borrow);

    for item in items_iter {
        tracing::debug!("Processing item: {:?}", item);

        let props = item.properties.borrow();

        // Explicit interaction properties take precedence over plain links
        // found in the content.
        for (prop, interaction_type) in [
            ("in-reply-to", MentionType::Reply), ("like-of", MentionType::Like),
            ("bookmark-of", MentionType::Bookmark), ("repost-of", MentionType::Repost)
        ] {
            if let Some(propvals) = props.get(prop) {
                tracing::debug!("Has a u-{} property", prop);
                let targets_link = propvals
                    .iter()
                    .any(|val| matches!(val, PropertyValue::Url(url) if url == link));
                if targets_link {
                    tracing::debug!("URL matches! Webmention is valid");
                    // NOTE(review): panics if serialization fails; a dedicated
                    // JSON error variant would allow propagating it instead.
                    return Ok(Some((interaction_type, serde_json::to_value(&*item).unwrap())));
                }
            }
        }

        // Process `content`: only the first value is considered, and only
        // when it is an HTML fragment.
        tracing::debug!("Processing e-content...");
        if let Some(PropertyValue::Fragment(content)) = props.get("content")
            .and_then(|values| values.first())
        {
            if fragment_links_to(&content.html, base_url, link) {
                return Ok(Some((MentionType::Mention, serde_json::to_value(&*item).unwrap())));
            }
        }
    }

    Ok(None)
}

/// Reports whether a `rel` attribute value contains the `nofollow` keyword.
///
/// Tokens are separated by ASCII whitespace (commas are tolerated as well) and
/// compared ASCII case-insensitively, so `rel="NoFollow"` and tab- or
/// newline-separated lists are recognised too.
fn rel_has_nofollow(rel: &str) -> bool {
    rel.split(|c: char| c == ',' || c.is_ascii_whitespace())
        .any(|token| token.eq_ignore_ascii_case("nofollow"))
}

/// Reports whether the HTML in `html` contains a followable `<a href>` whose
/// target, resolved against `base_url`, equals `link`.
fn fragment_links_to(html: &str, base_url: &url::Url, link: &url::Url) -> bool {
    tracing::debug!("Parsing HTML data...");
    let root = html5ever::parse_document(html5ever::rcdom::RcDom::default(), Default::default())
        .from_utf8()
        .one(html.as_bytes())
        .document;

    // This is a trick to unwrap recursion into a loop
    //
    // A list of unprocessed nodes is made. Then, in each iteration, the list
    // is "taken" and replaced with an empty list, which is populated with
    // nodes for the next iteration of the loop.
    //
    // Empty list means all nodes were processed.
    let mut unprocessed_nodes: Vec<Rc<html5ever::rcdom::Node>> = root.children.borrow().iter().cloned().collect();
    while !unprocessed_nodes.is_empty() {
        // "Take" the list out of its memory slot, replace it with an empty list
        let nodes = std::mem::take(&mut unprocessed_nodes);
        tracing::debug!("Processing list of {} nodes", nodes.len());
        for node in nodes {
            // Add children nodes to the list for the next iteration
            unprocessed_nodes.extend(node.children.borrow().iter().cloned());

            if let html5ever::rcdom::NodeData::Element { ref name, ref attrs, .. } = node.data {
                // If it's not `<a>`, skip it
                if name.local != *"a" { continue; }

                let attrs = attrs.borrow();

                // Don't count `rel="nofollow"` links — a web crawler should
                // ignore them and so for purposes of driving visitors they
                // are useless. This holds regardless of attribute order.
                let nofollow = attrs
                    .iter()
                    .any(|attr| attr.name.local == *"rel" && rel_has_nofollow(attr.value.as_ref()));
                if nofollow { continue; }

                // Be forgiving in parsing URLs, and resolve them against the
                // base URL; unparseable `href` values are simply ignored.
                let points_at_link = attrs
                    .iter()
                    .filter(|attr| attr.name.local == *"href")
                    .filter_map(|attr| base_url.join(attr.value.as_ref()).ok())
                    .any(|url| &url == link);
                if points_at_link {
                    return true;
                }
            }
        }
    }

    false
}
|