about summary refs log tree commit diff
path: root/src/webmentions/check.rs
blob: 380f4dbe12159c03eab6d4d5236e20a83d25c934 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
use html5ever::{self, tendril::TendrilSink};
use kittybox_util::MentionType;
use microformats::types::PropertyValue;
use std::rc::Rc;

// TODO: replace.
mod rcdom;

/// Errors that [`check_mention`] can return.
#[derive(thiserror::Error, Debug)]
pub enum Error {
    /// An error reported by the `microformats` crate; converted automatically
    /// through the generated `From<microformats::Error>` impl.
    #[error("microformats error: {0}")]
    Microformats(#[from] microformats::Error),
    // #[error("json error: {0}")]
    // Json(#[from] serde_json::Error),
    /// A URL parsing error; converted automatically through the generated
    /// `From<url::ParseError>` impl.
    #[error("url parse error: {0}")]
    UrlParse(#[from] url::ParseError),
}

/// Checks whether `document` links to `link` and classifies the kind of mention.
///
/// `document` is parsed as Microformats2 markup with `base_url` as its base.
/// Every top-level item is then examined, in document order:
///
/// 1. The `in-reply-to`, `like-of`, `bookmark-of` and `repost-of` properties are
///    searched (in that order) for a URL value equal to `link`. The first hit
///    yields the corresponding [`MentionType`].
/// 2. Otherwise, if the first `content` value of the item is an HTML fragment,
///    it is parsed and scanned for an `<a href="...">` whose target, resolved
///    against `base_url`, equals `link`. Anchors carrying `rel="nofollow"` are
///    ignored. A hit yields [`MentionType::Mention`].
///
/// # Returns
///
/// `Ok(Some((kind, item)))` for the first match, where `item` is the matching
/// Microformats2 item serialized to JSON; `Ok(None)` if nothing links to `link`.
///
/// # Errors
///
/// Propagates the error returned by `microformats::from_html` when the document
/// cannot be parsed.
///
/// # Panics
///
/// Panics if the matching item cannot be serialized with `serde_json::to_value`.
#[tracing::instrument]
pub fn check_mention(
    document: impl AsRef<str> + std::fmt::Debug,
    base_url: &url::Url,
    link: &url::Url,
) -> Result<Option<(MentionType, serde_json::Value)>, Error> {
    tracing::debug!("Parsing MF2 markup...");
    // First, check the document for MF2 markup
    let document = microformats::from_html(document.as_ref(), base_url.clone())?;

    for item in document.items.iter() {
        tracing::debug!("Processing item: {:?}", item);

        // Explicit interaction properties take precedence over plain links in
        // the content, so they are checked first.
        for (prop, interaction_type) in [
            ("in-reply-to", MentionType::Reply),
            ("like-of", MentionType::Like),
            ("bookmark-of", MentionType::Bookmark),
            ("repost-of", MentionType::Repost),
        ] {
            if let Some(propvals) = item.properties.get(prop) {
                tracing::debug!("Has a u-{} property", prop);
                for val in propvals {
                    if let PropertyValue::Url(url) = val {
                        if url == link {
                            tracing::debug!("URL matches! Webmention is valid");
                            return Ok(Some((
                                interaction_type,
                                serde_json::to_value(item).unwrap(),
                            )));
                        }
                    }
                }
            }
        }

        // Process `content`: only the first value is considered, and only when
        // it is an HTML fragment.
        tracing::debug!("Processing e-content...");
        if let Some(PropertyValue::Fragment(content)) = item
            .properties
            .get("content")
            .map(Vec::as_slice)
            .unwrap_or_default()
            .first()
        {
            tracing::debug!("Parsing HTML data...");
            // The parser only needs to read the bytes, so borrow them instead
            // of cloning the whole fragment.
            let root = html5ever::parse_document(rcdom::RcDom::default(), Default::default())
                .from_utf8()
                .one(content.html.as_bytes())
                .document;

            // Walk the tree level by level instead of recursing: each pass
            // takes the pending list (leaving an empty one behind) and queues
            // the children of every visited node for the next pass. An empty
            // list means every node has been visited.
            let mut unprocessed_nodes: Vec<Rc<rcdom::Node>> =
                root.children.borrow().iter().cloned().collect();
            while !unprocessed_nodes.is_empty() {
                let nodes = std::mem::take(&mut unprocessed_nodes);
                tracing::debug!("Processing list of {} nodes", nodes.len());
                for node in nodes {
                    // Queue children first so they are visited even when this
                    // node itself is skipped below.
                    unprocessed_nodes.extend(node.children.borrow().iter().cloned());

                    if let rcdom::NodeData::Element {
                        ref name,
                        ref attrs,
                        ..
                    } = node.data
                    {
                        // Only `<a>` elements can be mentions.
                        if name.local != *"a" {
                            continue;
                        }
                        let attrs = attrs.borrow();

                        // Don't count `rel="nofollow"` links — a web crawler should
                        // ignore them, and so for purposes of driving visitors they
                        // are useless. Checked before `href` so the outcome does not
                        // depend on attribute order.
                        if attrs.iter().any(|attr| {
                            attr.name.local == *"rel" && rel_has_nofollow(attr.value.as_ref())
                        }) {
                            continue;
                        }

                        // Be forgiving in parsing URLs, and resolve them against
                        // the base URL; unparseable targets simply don't match.
                        let is_mention = attrs
                            .iter()
                            .filter(|attr| attr.name.local == *"href")
                            .any(|attr| {
                                base_url
                                    .join(attr.value.as_ref())
                                    .map_or(false, |url| &url == link)
                            });

                        if is_mention {
                            return Ok(Some((
                                MentionType::Mention,
                                serde_json::to_value(item).unwrap(),
                            )));
                        }
                    }
                }
            }
        }
    }

    Ok(None)
}

/// Reports whether a `rel` attribute value contains the `nofollow` link type.
///
/// `rel` is a set of tokens separated by ASCII whitespace, and link-type
/// keywords are ASCII case-insensitive, so `"NoFollow"` and tab- or
/// newline-separated values are recognised too. Commas are also accepted as
/// separators to stay lenient towards malformed markup.
fn rel_has_nofollow(rel: &str) -> bool {
    rel.split(|c: char| c == ',' || c.is_ascii_whitespace())
        .any(|token| token.eq_ignore_ascii_case("nofollow"))
}