use std::cell::{RefCell, Ref}; use std::rc::Rc; use clap::Parser; use microformats::types::PropertyValue; use microformats::html5ever; use microformats::html5ever::tendril::TendrilSink; #[derive(thiserror::Error, Debug)] enum Error { #[error("http request error: {0}")] Http(#[from] reqwest::Error), #[error("microformats error: {0}")] Microformats(#[from] microformats::Error), #[error("json error: {0}")] Json(#[from] serde_json::Error), #[error("url parse error: {0}")] UrlParse(#[from] url::ParseError), } #[derive(Debug)] enum MentionType { Reply, Like, Repost, Bookmark, Mention } fn check_mention(document: impl AsRef<str>, base_url: &url::Url, link: &url::Url) -> Result<Option<MentionType>, Error> { // First, check the document for MF2 markup let document = microformats::from_html(document.as_ref(), base_url.clone())?; // Get an iterator of all items let items_iter = document.items.iter() .map(AsRef::as_ref) .map(RefCell::borrow); for item in items_iter { let props = item.properties.borrow(); for (prop, interaction_type) in [ ("in-reply-to", MentionType::Reply), ("like-of", MentionType::Like), ("bookmark-of", MentionType::Bookmark), ("repost-of", MentionType::Repost) ] { if let Some(propvals) = props.get(prop) { for val in propvals { if let PropertyValue::Url(url) = val { if url == link { return Ok(Some(interaction_type)) } } } } } // Process `content` if let Some(PropertyValue::Fragment(content)) = props.get("content") .map(Vec::as_slice) .unwrap_or_default() .first() { let root = html5ever::parse_document(html5ever::rcdom::RcDom::default(), Default::default()) .from_utf8() .one(content.html.to_owned().as_bytes()) .document; // This is a trick to unwrap recursion into a loop // // A list of unprocessed node is made. Then, in each // iteration, the list is "taken" and replaced with an // empty list, which is populated with nodes for the next // iteration of the loop. // // Empty list means all nodes were processed. let mut unprocessed_nodes: Vec<Rc<html5ever::rcdom::Node>> = root.children.borrow().iter().cloned().collect(); while unprocessed_nodes.len() > 0 { // "Take" the list out of its memory slot, replace it with an empty list let nodes = std::mem::take(&mut unprocessed_nodes); for node in nodes.into_iter() { // Add children nodes to the list for the next iteration unprocessed_nodes.extend(node.children.borrow().iter().cloned()); if let html5ever::rcdom::NodeData::Element { ref name, ref attrs, .. } = node.data { // If it's not `<a>`, skip it if name.local != *"a" { continue; } let mut is_mention: bool = false; for attr in attrs.borrow().iter() { // if it's not `<a href="...">`, skip it if attr.name.local != *"href" { continue; } if attr.name.local == *"rel" { // Don't count `rel="nofollow"` links — a web crawler should ignore them // and so for purposes of driving visitors they are useless if attr.value.as_ref().split([',', ' ']).any(|v| v == "nofollow") { continue; } } // Be forgiving in parsing URLs, and resolve them against the base URL if let Ok(url) = base_url.join(attr.value.as_ref()) { if &url == link { is_mention = true; } } } if is_mention { return Ok(Some(MentionType::Mention)); } } } } } } Ok(None) } #[derive(Parser, Debug)] #[clap( name = "kittybox-check-webmention", author = "Vika <vika@fireburn.ru>", version = env!("CARGO_PKG_VERSION"), about = "Verify an incoming webmention" )] struct Args { #[clap(value_parser)] url: url::Url, #[clap(value_parser)] link: url::Url } #[tokio::main] async fn main() -> Result<(), self::Error> { let args = Args::parse(); let http: reqwest::Client = { #[allow(unused_mut)] let mut builder = reqwest::Client::builder() .user_agent(concat!( env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION") )); builder.build().unwrap() }; let response = http.get(args.url.clone()).send().await?; let text = response.text().await?; if let Some(mention_type) = check_mention(text, &args.url, &args.link)? { println!("{:?}", mention_type); Ok(()) } else { std::process::exit(1) } }