about summary refs log tree commit diff
path: root/kittybox-rs/src/webmentions/check.rs
blob: f7322f7f86b556d54701483253203b187692d209 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
use std::{cell::RefCell, rc::Rc};
use microformats::{types::PropertyValue, html5ever::{self, tendril::TendrilSink}};
use kittybox_util::MentionType;

#[derive(thiserror::Error, Debug)]
pub enum Error {
    #[error("microformats error: {0}")]
    Microformats(#[from] microformats::Error),
    // #[error("json error: {0}")]
    // Json(#[from] serde_json::Error),
    #[error("url parse error: {0}")]
    UrlParse(#[from] url::ParseError),
}

/// Verifies that `document` really mentions `link` and classifies the mention.
///
/// The document is parsed as Microformats2 markup relative to `base_url`.
/// Every item in the parsed document's `items` list is then inspected in
/// order:
///
/// 1. If one of its `in-reply-to`, `like-of`, `bookmark-of` or `repost-of`
///    properties holds a URL equal to `link`, the corresponding
///    [`MentionType`] is returned.
/// 2. Otherwise, if the first `content` value of the item is an HTML
///    fragment containing an `<a href="...">` that resolves (against
///    `base_url`) to `link` and is not marked `rel="nofollow"`,
///    [`MentionType::Mention`] is returned.
///
/// On a match the matching item, serialized to JSON, is returned alongside
/// the mention type. `Ok(None)` means that no inspected item links to `link`.
///
/// # Errors
///
/// Returns [`Error::Microformats`] if the MF2 parser rejects the document.
///
/// # Panics
///
/// Panics if the matching item cannot be serialized with
/// `serde_json::to_value`.
#[tracing::instrument]
pub fn check_mention(document: impl AsRef<str> + std::fmt::Debug, base_url: &url::Url, link: &url::Url) -> Result<Option<(MentionType, serde_json::Value)>, Error> {
    tracing::debug!("Parsing MF2 markup...");
    // First, check the document for MF2 markup
    let document = microformats::from_html(document.as_ref(), base_url.clone())?;

    // Get an iterator of all items
    let items_iter = document.items.iter()
        .map(AsRef::as_ref)
        .map(RefCell::borrow);

    for item in items_iter {
        tracing::debug!("Processing item: {:?}", item);

        let props = item.properties.borrow();
        // Explicit interaction properties take precedence over a plain link
        // inside the content of the same item.
        for (prop, interaction_type) in [
            ("in-reply-to", MentionType::Reply), ("like-of", MentionType::Like),
            ("bookmark-of", MentionType::Bookmark), ("repost-of", MentionType::Repost)
        ] {
            if let Some(propvals) = props.get(prop) {
                tracing::debug!("Has a u-{} property", prop);
                for val in propvals {
                    if let PropertyValue::Url(url) = val {
                        if url == link {
                            tracing::debug!("URL matches! Webmention is valid");
                            return Ok(Some((interaction_type, serde_json::to_value(&*item).unwrap())))
                        }
                    }
                }
            }
        }

        // Process `content`; only the first value is considered.
        tracing::debug!("Processing e-content...");
        if let Some(PropertyValue::Fragment(content)) = props.get("content")
            .map(Vec::as_slice)
            .unwrap_or_default()
            .first()
        {
            tracing::debug!("Parsing HTML data...");
            if fragment_links_to(&content.html, base_url, link) {
                return Ok(Some((MentionType::Mention, serde_json::to_value(&*item).unwrap())));
            }
        }
    }

    Ok(None)
}

/// Returns `true` if the HTML in `html` contains an `<a>` element whose
/// `href`, resolved against `base_url`, equals `link` and whose `rel`
/// attribute does not contain `nofollow`.
fn fragment_links_to(html: &str, base_url: &url::Url, link: &url::Url) -> bool {
    let root = html5ever::parse_document(html5ever::rcdom::RcDom::default(), Default::default())
        .from_utf8()
        .one(html.as_bytes())
        .document;

    // This is a trick to unwrap recursion into a loop
    //
    // A list of unprocessed nodes is made. Then, in each iteration, the
    // list is "taken" and replaced with an empty list, which is populated
    // with nodes for the next iteration of the loop.
    //
    // Empty list means all nodes were processed.
    let mut unprocessed_nodes: Vec<Rc<html5ever::rcdom::Node>> = root.children.borrow().iter().cloned().collect();
    while !unprocessed_nodes.is_empty() {
        // "Take" the list out of its memory slot, replace it with an empty list
        let nodes = std::mem::take(&mut unprocessed_nodes);
        tracing::debug!("Processing list of {} nodes", nodes.len());
        for node in nodes {
            // Add children nodes to the list for the next iteration
            unprocessed_nodes.extend(node.children.borrow().iter().cloned());

            if let html5ever::rcdom::NodeData::Element { ref name, ref attrs, .. } = node.data {
                // If it's not `<a>`, skip it
                if name.local != *"a" { continue; }
                let attrs = attrs.borrow();

                // Don't count `rel="nofollow"` links — a web crawler should ignore them
                // and so for purposes of driving visitors they are useless.
                // This is checked before `href` so attribute order does not matter.
                let nofollow = attrs.iter()
                    .any(|attr| attr.name.local == *"rel" && rel_has_nofollow(attr.value.as_ref()));
                if nofollow { continue; }

                // Be forgiving in parsing URLs, and resolve them against the base URL
                let is_mention = attrs.iter()
                    .filter(|attr| attr.name.local == *"href")
                    .any(|attr| base_url.join(attr.value.as_ref()).map_or(false, |url| &url == link));
                if is_mention {
                    return true;
                }
            }
        }
    }

    false
}

/// Returns `true` if the value of a `rel` attribute contains the `nofollow`
/// keyword.
///
/// Tokens are separated by ASCII whitespace (commas are tolerated as well,
/// for leniency) and compared ASCII case-insensitively, so `"NoFollow"` and
/// `"noopener\tnofollow"` both count.
fn rel_has_nofollow(rel: &str) -> bool {
    rel.split(|c: char| c == ',' || c.is_ascii_whitespace())
        .any(|token| token.eq_ignore_ascii_case("nofollow"))
}