use std::{cell::RefCell, rc::Rc}; use microformats::{types::PropertyValue, html5ever::{self, tendril::TendrilSink}}; use kittybox_util::MentionType; #[derive(thiserror::Error, Debug)] pub enum Error { #[error("microformats error: {0}")] Microformats(#[from] microformats::Error), // #[error("json error: {0}")] // Json(#[from] serde_json::Error), #[error("url parse error: {0}")] UrlParse(#[from] url::ParseError), } pub fn check_mention(document: impl AsRef, base_url: &url::Url, link: &url::Url) -> Result, Error> { // First, check the document for MF2 markup let document = microformats::from_html(document.as_ref(), base_url.clone())?; // Get an iterator of all items let items_iter = document.items.iter() .map(AsRef::as_ref) .map(RefCell::borrow); for item in items_iter { let props = item.properties.borrow(); for (prop, interaction_type) in [ ("in-reply-to", MentionType::Reply), ("like-of", MentionType::Like), ("bookmark-of", MentionType::Bookmark), ("repost-of", MentionType::Repost) ] { if let Some(propvals) = props.get(prop) { for val in propvals { if let PropertyValue::Url(url) = val { if url == link { return Ok(Some((interaction_type, serde_json::to_value(&*item).unwrap()))) } } } } } // Process `content` if let Some(PropertyValue::Fragment(content)) = props.get("content") .map(Vec::as_slice) .unwrap_or_default() .first() { let root = html5ever::parse_document(html5ever::rcdom::RcDom::default(), Default::default()) .from_utf8() .one(content.html.to_owned().as_bytes()) .document; // This is a trick to unwrap recursion into a loop // // A list of unprocessed node is made. Then, in each // iteration, the list is "taken" and replaced with an // empty list, which is populated with nodes for the next // iteration of the loop. // // Empty list means all nodes were processed. let mut unprocessed_nodes: Vec> = root.children.borrow().iter().cloned().collect(); while !unprocessed_nodes.is_empty() { // "Take" the list out of its memory slot, replace it with an empty list let nodes = std::mem::take(&mut unprocessed_nodes); 'nodes_loop: for node in nodes.into_iter() { // Add children nodes to the list for the next iteration unprocessed_nodes.extend(node.children.borrow().iter().cloned()); if let html5ever::rcdom::NodeData::Element { ref name, ref attrs, .. } = node.data { // If it's not ``, skip it if name.local != *"a" { continue; } let mut is_mention: bool = false; for attr in attrs.borrow().iter() { if attr.name.local == *"rel" { // Don't count `rel="nofollow"` links — a web crawler should ignore them // and so for purposes of driving visitors they are useless if attr.value .as_ref() .split([',', ' ']) .any(|v| v == "nofollow") { // Skip the entire node. continue 'nodes_loop; } } // if it's not ``, skip it if attr.name.local != *"href" { continue; } // Be forgiving in parsing URLs, and resolve them against the base URL if let Ok(url) = base_url.join(attr.value.as_ref()) { if &url == link { is_mention = true; } } } if is_mention { return Ok(Some((MentionType::Mention, serde_json::to_value(&*item).unwrap()))); } } } } } } Ok(None) }