use std::rc::Rc; use microformats::types::PropertyValue; use html5ever::{self, tendril::TendrilSink}; use kittybox_util::MentionType; // TODO: replace. mod rcdom; #[derive(thiserror::Error, Debug)] pub enum Error { #[error("microformats error: {0}")] Microformats(#[from] microformats::Error), // #[error("json error: {0}")] // Json(#[from] serde_json::Error), #[error("url parse error: {0}")] UrlParse(#[from] url::ParseError), } #[tracing::instrument] pub fn check_mention(document: impl AsRef<str> + std::fmt::Debug, base_url: &url::Url, link: &url::Url) -> Result<Option<(MentionType, serde_json::Value)>, Error> { tracing::debug!("Parsing MF2 markup..."); // First, check the document for MF2 markup let document = microformats::from_html(document.as_ref(), base_url.clone())?; // Get an iterator of all items let items_iter = document.items.iter(); for item in items_iter { tracing::debug!("Processing item: {:?}", item); for (prop, interaction_type) in [ ("in-reply-to", MentionType::Reply), ("like-of", MentionType::Like), ("bookmark-of", MentionType::Bookmark), ("repost-of", MentionType::Repost) ] { if let Some(propvals) = item.properties.get(prop) { tracing::debug!("Has a u-{} property", prop); for val in propvals { if let PropertyValue::Url(url) = val { if url == link { tracing::debug!("URL matches! Webmention is valid"); return Ok(Some((interaction_type, serde_json::to_value(&*item).unwrap()))) } } } } } // Process `content` tracing::debug!("Processing e-content..."); if let Some(PropertyValue::Fragment(content)) = item.properties.get("content") .map(Vec::as_slice) .unwrap_or_default() .first() { tracing::debug!("Parsing HTML data..."); let root = html5ever::parse_document(rcdom::RcDom::default(), Default::default()) .from_utf8() .one(content.html.to_owned().as_bytes()) .document; // This is a trick to unwrap recursion into a loop // // A list of unprocessed node is made. Then, in each // iteration, the list is "taken" and replaced with an // empty list, which is populated with nodes for the next // iteration of the loop. // // Empty list means all nodes were processed. let mut unprocessed_nodes: Vec<Rc<rcdom::Node>> = root.children.borrow().iter().cloned().collect(); while !unprocessed_nodes.is_empty() { // "Take" the list out of its memory slot, replace it with an empty list let nodes = std::mem::take(&mut unprocessed_nodes); tracing::debug!("Processing list of {} nodes", nodes.len()); 'nodes_loop: for node in nodes.into_iter() { // Add children nodes to the list for the next iteration unprocessed_nodes.extend(node.children.borrow().iter().cloned()); if let rcdom::NodeData::Element { ref name, ref attrs, .. } = node.data { // If it's not `<a>`, skip it if name.local != *"a" { continue; } let mut is_mention: bool = false; for attr in attrs.borrow().iter() { if attr.name.local == *"rel" { // Don't count `rel="nofollow"` links — a web crawler should ignore them // and so for purposes of driving visitors they are useless if attr.value .as_ref() .split([',', ' ']) .any(|v| v == "nofollow") { // Skip the entire node. continue 'nodes_loop; } } // if it's not `<a href="...">`, skip it if attr.name.local != *"href" { continue; } // Be forgiving in parsing URLs, and resolve them against the base URL if let Ok(url) = base_url.join(attr.value.as_ref()) { if &url == link { is_mention = true; } } } if is_mention { return Ok(Some((MentionType::Mention, serde_json::to_value(&*item).unwrap()))); } } } } } } Ok(None) }