From 2318a33f9b359ae27b52cd9a19db1f6782d8dae3 Mon Sep 17 00:00:00 2001 From: Vika Date: Thu, 1 Aug 2024 22:50:28 +0300 Subject: Upgrade dependencies and fix deprecated functionality I think I managed to not lose any functionality from my dependencies. sqlparser remains unupgraded, but that's mostly because it is only used in one example and it's not worth it to upgrade right now. --- src/webmentions/check.rs | 23 +- src/webmentions/check/rcdom.rs | 515 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 527 insertions(+), 11 deletions(-) create mode 100644 src/webmentions/check/rcdom.rs (limited to 'src/webmentions') diff --git a/src/webmentions/check.rs b/src/webmentions/check.rs index 6dc6a25..178c008 100644 --- a/src/webmentions/check.rs +++ b/src/webmentions/check.rs @@ -1,7 +1,11 @@ -use std::{cell::RefCell, rc::Rc}; -use microformats::{types::PropertyValue, html5ever::{self, tendril::TendrilSink}}; +use std::rc::Rc; +use microformats::types::PropertyValue; +use html5ever::{self, tendril::TendrilSink}; use kittybox_util::MentionType; +// TODO: replace. +mod rcdom; + #[derive(thiserror::Error, Debug)] pub enum Error { #[error("microformats error: {0}")] @@ -19,19 +23,16 @@ pub fn check_mention(document: impl AsRef + std::fmt::Debug, base_url: &url let document = microformats::from_html(document.as_ref(), base_url.clone())?; // Get an iterator of all items - let items_iter = document.items.iter() - .map(AsRef::as_ref) - .map(RefCell::borrow); + let items_iter = document.items.iter(); for item in items_iter { tracing::debug!("Processing item: {:?}", item); - let props = item.properties.borrow(); for (prop, interaction_type) in [ ("in-reply-to", MentionType::Reply), ("like-of", MentionType::Like), ("bookmark-of", MentionType::Bookmark), ("repost-of", MentionType::Repost) ] { - if let Some(propvals) = props.get(prop) { + if let Some(propvals) = item.properties.get(prop) { tracing::debug!("Has a u-{} property", prop); for val in propvals { if let PropertyValue::Url(url) = val { @@ -45,13 +46,13 @@ pub fn check_mention(document: impl AsRef + std::fmt::Debug, base_url: &url } // Process `content` tracing::debug!("Processing e-content..."); - if let Some(PropertyValue::Fragment(content)) = props.get("content") + if let Some(PropertyValue::Fragment(content)) = item.properties.get("content") .map(Vec::as_slice) .unwrap_or_default() .first() { tracing::debug!("Parsing HTML data..."); - let root = html5ever::parse_document(html5ever::rcdom::RcDom::default(), Default::default()) + let root = html5ever::parse_document(rcdom::RcDom::default(), Default::default()) .from_utf8() .one(content.html.to_owned().as_bytes()) .document; @@ -64,7 +65,7 @@ pub fn check_mention(document: impl AsRef + std::fmt::Debug, base_url: &url // iteration of the loop. // // Empty list means all nodes were processed. - let mut unprocessed_nodes: Vec> = root.children.borrow().iter().cloned().collect(); + let mut unprocessed_nodes: Vec> = root.children.borrow().iter().cloned().collect(); while !unprocessed_nodes.is_empty() { // "Take" the list out of its memory slot, replace it with an empty list let nodes = std::mem::take(&mut unprocessed_nodes); @@ -73,7 +74,7 @@ pub fn check_mention(document: impl AsRef + std::fmt::Debug, base_url: &url // Add children nodes to the list for the next iteration unprocessed_nodes.extend(node.children.borrow().iter().cloned()); - if let html5ever::rcdom::NodeData::Element { ref name, ref attrs, .. } = node.data { + if let rcdom::NodeData::Element { ref name, ref attrs, .. } = node.data { // If it's not ``, skip it if name.local != *"a" { continue; } let mut is_mention: bool = false; diff --git a/src/webmentions/check/rcdom.rs b/src/webmentions/check/rcdom.rs new file mode 100644 index 0000000..549610f --- /dev/null +++ b/src/webmentions/check/rcdom.rs @@ -0,0 +1,515 @@ +// Copyright 2014-2017 The html5ever Project Developers. +// Copyright Michael Howell and others. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#![allow(missing_docs)] + +//! A simple reference-counted DOM. +//! +//! This is sufficient as a static parse tree, but don't build a +//! web browser using it. :) +//! +//! A DOM is a [tree structure] with ordered children that can be represented in an XML-like +//! format. For example, the following graph +//! +//! ```text +//! div +//! +- "text node" +//! +- span +//! ``` +//! in HTML would be serialized as +//! +//! ```html +//!
text node
+//! ``` +//! +//! See the [document object model article on wikipedia][dom wiki] for more information. +//! +//! This implementation stores the information associated with each node once, and then hands out +//! refs to children. The nodes themselves are reference-counted to avoid copying - you can create +//! a new ref and then a node will outlive the document. Nodes own their children, but only have +//! weak references to their parents. +//! +//! [tree structure]: https://en.wikipedia.org/wiki/Tree_(data_structure) +//! [dom wiki]: https://en.wikipedia.org/wiki/Document_Object_Model + +use std::borrow::Cow; +use std::cell::{Cell, RefCell}; +use std::collections::{HashSet, VecDeque}; +use std::default::Default; +use std::fmt; +use std::io; +use std::mem; +use std::rc::{Rc, Weak}; + +use html5ever::tendril::StrTendril; + +use html5ever::interface::tree_builder; +use html5ever::interface::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink}; +use html5ever::serialize::TraversalScope; +use html5ever::serialize::TraversalScope::{ChildrenOnly, IncludeNode}; +use html5ever::serialize::{Serialize, Serializer}; +use html5ever::Attribute; +use html5ever::ExpandedName; +use html5ever::QualName; + +/// The different kinds of nodes in the DOM. +#[derive(Debug)] +pub enum NodeData { + /// The `Document` itself - the root node of a HTML document. + Document, + + /// A `DOCTYPE` with name, public id, and system id. See + /// [document type declaration on wikipedia][dtd wiki]. + /// + /// [dtd wiki]: https://en.wikipedia.org/wiki/Document_type_declaration + Doctype { + name: StrTendril, + public_id: StrTendril, + system_id: StrTendril, + }, + + /// A text node. + Text { contents: RefCell }, + + /// A comment. + Comment { contents: StrTendril }, + + /// An element with attributes. + Element { + name: QualName, + attrs: RefCell>, + + /// For HTML \ elements, the [template contents]. + /// + /// [template contents]: https://html.spec.whatwg.org/multipage/#template-contents + template_contents: RefCell>, + + /// Whether the node is a [HTML integration point]. + /// + /// [HTML integration point]: https://html.spec.whatwg.org/multipage/#html-integration-point + mathml_annotation_xml_integration_point: bool, + }, + + /// A Processing instruction. + ProcessingInstruction { + target: StrTendril, + contents: StrTendril, + }, +} + +/// A DOM node. +pub struct Node { + /// Parent node. + pub parent: Cell>, + /// Child nodes of this node. + pub children: RefCell>, + /// Represents this node's data. + pub data: NodeData, +} + +impl Node { + /// Create a new node from its contents + pub fn new(data: NodeData) -> Rc { + Rc::new(Node { + data, + parent: Cell::new(None), + children: RefCell::new(Vec::new()), + }) + } +} + +impl Drop for Node { + fn drop(&mut self) { + let mut nodes = mem::take(&mut *self.children.borrow_mut()); + while let Some(node) = nodes.pop() { + let children = mem::take(&mut *node.children.borrow_mut()); + nodes.extend(children.into_iter()); + if let NodeData::Element { + ref template_contents, + .. + } = node.data + { + if let Some(template_contents) = template_contents.borrow_mut().take() { + nodes.push(template_contents); + } + } + } + } +} + +impl fmt::Debug for Node { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + fmt.debug_struct("Node") + .field("data", &self.data) + .field("children", &self.children) + .finish() + } +} + +/// Reference to a DOM node. +pub type Handle = Rc; + +/// Weak reference to a DOM node, used for parent pointers. +pub type WeakHandle = Weak; + +/// Append a parentless node to another nodes' children +fn append(new_parent: &Handle, child: Handle) { + let previous_parent = child.parent.replace(Some(Rc::downgrade(new_parent))); + // Invariant: child cannot have existing parent + assert!(previous_parent.is_none()); + new_parent.children.borrow_mut().push(child); +} + +/// If the node has a parent, get it and this node's position in its children +fn get_parent_and_index(target: &Handle) -> Option<(Handle, usize)> { + if let Some(weak) = target.parent.take() { + let parent = weak.upgrade().expect("dangling weak pointer"); + target.parent.set(Some(weak)); + let i = match parent + .children + .borrow() + .iter() + .enumerate() + .find(|&(_, child)| Rc::ptr_eq(child, target)) + { + Some((i, _)) => i, + None => panic!("have parent but couldn't find in parent's children!"), + }; + Some((parent, i)) + } else { + None + } +} + +fn append_to_existing_text(prev: &Handle, text: &str) -> bool { + match prev.data { + NodeData::Text { ref contents } => { + contents.borrow_mut().push_slice(text); + true + } + _ => false, + } +} + +fn remove_from_parent(target: &Handle) { + if let Some((parent, i)) = get_parent_and_index(target) { + parent.children.borrow_mut().remove(i); + target.parent.set(None); + } +} + +/// The DOM itself; the result of parsing. +pub struct RcDom { + /// The `Document` itself. + pub document: Handle, + + /// Errors that occurred during parsing. + pub errors: Vec>, + + /// The document's quirks mode. + pub quirks_mode: QuirksMode, +} + +impl TreeSink for RcDom { + type Output = Self; + fn finish(self) -> Self { + self + } + + type Handle = Handle; + + fn parse_error(&mut self, msg: Cow<'static, str>) { + self.errors.push(msg); + } + + fn get_document(&mut self) -> Handle { + self.document.clone() + } + + fn get_template_contents(&mut self, target: &Handle) -> Handle { + if let NodeData::Element { + ref template_contents, + .. + } = target.data + { + template_contents + .borrow() + .as_ref() + .expect("not a template element!") + .clone() + } else { + panic!("not a template element!") + } + } + + fn set_quirks_mode(&mut self, mode: QuirksMode) { + self.quirks_mode = mode; + } + + fn same_node(&self, x: &Handle, y: &Handle) -> bool { + Rc::ptr_eq(x, y) + } + + fn elem_name<'a>(&self, target: &'a Handle) -> ExpandedName<'a> { + return match target.data { + NodeData::Element { ref name, .. } => name.expanded(), + _ => panic!("not an element!"), + }; + } + + fn create_element( + &mut self, + name: QualName, + attrs: Vec, + flags: ElementFlags, + ) -> Handle { + Node::new(NodeData::Element { + name, + attrs: RefCell::new(attrs), + template_contents: RefCell::new(if flags.template { + Some(Node::new(NodeData::Document)) + } else { + None + }), + mathml_annotation_xml_integration_point: flags.mathml_annotation_xml_integration_point, + }) + } + + fn create_comment(&mut self, text: StrTendril) -> Handle { + Node::new(NodeData::Comment { contents: text }) + } + + fn create_pi(&mut self, target: StrTendril, data: StrTendril) -> Handle { + Node::new(NodeData::ProcessingInstruction { + target, + contents: data, + }) + } + + fn append(&mut self, parent: &Handle, child: NodeOrText) { + // Append to an existing Text node if we have one. + if let NodeOrText::AppendText(ref text) = child { + if let Some(h) = parent.children.borrow().last() { + if append_to_existing_text(h, text) { + return; + } + } + } + + append( + parent, + match child { + NodeOrText::AppendText(text) => Node::new(NodeData::Text { + contents: RefCell::new(text), + }), + NodeOrText::AppendNode(node) => node, + }, + ); + } + + fn append_before_sibling(&mut self, sibling: &Handle, child: NodeOrText) { + let (parent, i) = get_parent_and_index(sibling) + .expect("append_before_sibling called on node without parent"); + + let child = match (child, i) { + // No previous node. + (NodeOrText::AppendText(text), 0) => Node::new(NodeData::Text { + contents: RefCell::new(text), + }), + + // Look for a text node before the insertion point. + (NodeOrText::AppendText(text), i) => { + let children = parent.children.borrow(); + let prev = &children[i - 1]; + if append_to_existing_text(prev, &text) { + return; + } + Node::new(NodeData::Text { + contents: RefCell::new(text), + }) + } + + // The tree builder promises we won't have a text node after + // the insertion point. + + // Any other kind of node. + (NodeOrText::AppendNode(node), _) => node, + }; + + remove_from_parent(&child); + + child.parent.set(Some(Rc::downgrade(&parent))); + parent.children.borrow_mut().insert(i, child); + } + + fn append_based_on_parent_node( + &mut self, + element: &Self::Handle, + prev_element: &Self::Handle, + child: NodeOrText, + ) { + let parent = element.parent.take(); + let has_parent = parent.is_some(); + element.parent.set(parent); + + if has_parent { + self.append_before_sibling(element, child); + } else { + self.append(prev_element, child); + } + } + + fn append_doctype_to_document( + &mut self, + name: StrTendril, + public_id: StrTendril, + system_id: StrTendril, + ) { + append( + &self.document, + Node::new(NodeData::Doctype { + name, + public_id, + system_id, + }), + ); + } + + fn add_attrs_if_missing(&mut self, target: &Handle, attrs: Vec) { + let mut existing = if let NodeData::Element { ref attrs, .. } = target.data { + attrs.borrow_mut() + } else { + panic!("not an element") + }; + + let existing_names = existing + .iter() + .map(|e| e.name.clone()) + .collect::>(); + existing.extend( + attrs + .into_iter() + .filter(|attr| !existing_names.contains(&attr.name)), + ); + } + + fn remove_from_parent(&mut self, target: &Handle) { + remove_from_parent(target); + } + + fn reparent_children(&mut self, node: &Handle, new_parent: &Handle) { + let mut children = node.children.borrow_mut(); + let mut new_children = new_parent.children.borrow_mut(); + for child in children.iter() { + let previous_parent = child.parent.replace(Some(Rc::downgrade(new_parent))); + assert!(Rc::ptr_eq( + node, + &previous_parent.unwrap().upgrade().expect("dangling weak") + )) + } + new_children.extend(mem::take(&mut *children)); + } + + fn is_mathml_annotation_xml_integration_point(&self, target: &Handle) -> bool { + if let NodeData::Element { + mathml_annotation_xml_integration_point, + .. + } = target.data + { + mathml_annotation_xml_integration_point + } else { + panic!("not an element!") + } + } +} + +impl Default for RcDom { + fn default() -> RcDom { + RcDom { + document: Node::new(NodeData::Document), + errors: vec![], + quirks_mode: tree_builder::NoQuirks, + } + } +} + +enum SerializeOp { + Open(Handle), + Close(QualName), +} + +pub struct SerializableHandle(Handle); + +impl From for SerializableHandle { + fn from(h: Handle) -> SerializableHandle { + SerializableHandle(h) + } +} + +impl Serialize for SerializableHandle { + fn serialize(&self, serializer: &mut S, traversal_scope: TraversalScope) -> io::Result<()> + where + S: Serializer, + { + let mut ops = VecDeque::new(); + match traversal_scope { + IncludeNode => ops.push_back(SerializeOp::Open(self.0.clone())), + ChildrenOnly(_) => ops.extend( + self.0 + .children + .borrow() + .iter() + .map(|h| SerializeOp::Open(h.clone())), + ), + } + + while let Some(op) = ops.pop_front() { + match op { + SerializeOp::Open(handle) => match handle.data { + NodeData::Element { + ref name, + ref attrs, + .. + } => { + serializer.start_elem( + name.clone(), + attrs.borrow().iter().map(|at| (&at.name, &at.value[..])), + )?; + + ops.reserve(1 + handle.children.borrow().len()); + ops.push_front(SerializeOp::Close(name.clone())); + + for child in handle.children.borrow().iter().rev() { + ops.push_front(SerializeOp::Open(child.clone())); + } + } + + NodeData::Doctype { ref name, .. } => serializer.write_doctype(name)?, + + NodeData::Text { ref contents } => serializer.write_text(&contents.borrow())?, + + NodeData::Comment { ref contents } => serializer.write_comment(contents)?, + + NodeData::ProcessingInstruction { + ref target, + ref contents, + } => serializer.write_processing_instruction(target, contents)?, + + NodeData::Document => panic!("Can't serialize Document node itself"), + }, + + SerializeOp::Close(name) => { + serializer.end_elem(name)?; + } + } + } + + Ok(()) + } +} -- cgit 1.4.1