From 2318a33f9b359ae27b52cd9a19db1f6782d8dae3 Mon Sep 17 00:00:00 2001 From: Vika Date: Thu, 1 Aug 2024 22:50:28 +0300 Subject: Upgrade dependencies and fix deprecated functionality I think I managed to not lose any functionality from my dependencies. sqlparser remains unupgraded, but that's mostly because it is only used in one example and it's not worth it to upgrade right now. --- src/webmentions/check/rcdom.rs | 515 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 515 insertions(+) create mode 100644 src/webmentions/check/rcdom.rs (limited to 'src/webmentions/check') diff --git a/src/webmentions/check/rcdom.rs b/src/webmentions/check/rcdom.rs new file mode 100644 index 0000000..549610f --- /dev/null +++ b/src/webmentions/check/rcdom.rs @@ -0,0 +1,515 @@ +// Copyright 2014-2017 The html5ever Project Developers. +// Copyright Michael Howell and others. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#![allow(missing_docs)] + +//! A simple reference-counted DOM. +//! +//! This is sufficient as a static parse tree, but don't build a +//! web browser using it. :) +//! +//! A DOM is a [tree structure] with ordered children that can be represented in an XML-like +//! format. For example, the following graph +//! +//! ```text +//! div +//! +- "text node" +//! +- span +//! ``` +//! in HTML would be serialized as +//! +//! ```html +//!
text node
+//! ``` +//! +//! See the [document object model article on wikipedia][dom wiki] for more information. +//! +//! This implementation stores the information associated with each node once, and then hands out +//! refs to children. The nodes themselves are reference-counted to avoid copying - you can create +//! a new ref and then a node will outlive the document. Nodes own their children, but only have +//! weak references to their parents. +//! +//! [tree structure]: https://en.wikipedia.org/wiki/Tree_(data_structure) +//! [dom wiki]: https://en.wikipedia.org/wiki/Document_Object_Model + +use std::borrow::Cow; +use std::cell::{Cell, RefCell}; +use std::collections::{HashSet, VecDeque}; +use std::default::Default; +use std::fmt; +use std::io; +use std::mem; +use std::rc::{Rc, Weak}; + +use html5ever::tendril::StrTendril; + +use html5ever::interface::tree_builder; +use html5ever::interface::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink}; +use html5ever::serialize::TraversalScope; +use html5ever::serialize::TraversalScope::{ChildrenOnly, IncludeNode}; +use html5ever::serialize::{Serialize, Serializer}; +use html5ever::Attribute; +use html5ever::ExpandedName; +use html5ever::QualName; + +/// The different kinds of nodes in the DOM. +#[derive(Debug)] +pub enum NodeData { + /// The `Document` itself - the root node of a HTML document. + Document, + + /// A `DOCTYPE` with name, public id, and system id. See + /// [document type declaration on wikipedia][dtd wiki]. + /// + /// [dtd wiki]: https://en.wikipedia.org/wiki/Document_type_declaration + Doctype { + name: StrTendril, + public_id: StrTendril, + system_id: StrTendril, + }, + + /// A text node. + Text { contents: RefCell }, + + /// A comment. + Comment { contents: StrTendril }, + + /// An element with attributes. + Element { + name: QualName, + attrs: RefCell>, + + /// For HTML \ elements, the [template contents]. + /// + /// [template contents]: https://html.spec.whatwg.org/multipage/#template-contents + template_contents: RefCell>, + + /// Whether the node is a [HTML integration point]. + /// + /// [HTML integration point]: https://html.spec.whatwg.org/multipage/#html-integration-point + mathml_annotation_xml_integration_point: bool, + }, + + /// A Processing instruction. + ProcessingInstruction { + target: StrTendril, + contents: StrTendril, + }, +} + +/// A DOM node. +pub struct Node { + /// Parent node. + pub parent: Cell>, + /// Child nodes of this node. + pub children: RefCell>, + /// Represents this node's data. + pub data: NodeData, +} + +impl Node { + /// Create a new node from its contents + pub fn new(data: NodeData) -> Rc { + Rc::new(Node { + data, + parent: Cell::new(None), + children: RefCell::new(Vec::new()), + }) + } +} + +impl Drop for Node { + fn drop(&mut self) { + let mut nodes = mem::take(&mut *self.children.borrow_mut()); + while let Some(node) = nodes.pop() { + let children = mem::take(&mut *node.children.borrow_mut()); + nodes.extend(children.into_iter()); + if let NodeData::Element { + ref template_contents, + .. + } = node.data + { + if let Some(template_contents) = template_contents.borrow_mut().take() { + nodes.push(template_contents); + } + } + } + } +} + +impl fmt::Debug for Node { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + fmt.debug_struct("Node") + .field("data", &self.data) + .field("children", &self.children) + .finish() + } +} + +/// Reference to a DOM node. +pub type Handle = Rc; + +/// Weak reference to a DOM node, used for parent pointers. +pub type WeakHandle = Weak; + +/// Append a parentless node to another nodes' children +fn append(new_parent: &Handle, child: Handle) { + let previous_parent = child.parent.replace(Some(Rc::downgrade(new_parent))); + // Invariant: child cannot have existing parent + assert!(previous_parent.is_none()); + new_parent.children.borrow_mut().push(child); +} + +/// If the node has a parent, get it and this node's position in its children +fn get_parent_and_index(target: &Handle) -> Option<(Handle, usize)> { + if let Some(weak) = target.parent.take() { + let parent = weak.upgrade().expect("dangling weak pointer"); + target.parent.set(Some(weak)); + let i = match parent + .children + .borrow() + .iter() + .enumerate() + .find(|&(_, child)| Rc::ptr_eq(child, target)) + { + Some((i, _)) => i, + None => panic!("have parent but couldn't find in parent's children!"), + }; + Some((parent, i)) + } else { + None + } +} + +fn append_to_existing_text(prev: &Handle, text: &str) -> bool { + match prev.data { + NodeData::Text { ref contents } => { + contents.borrow_mut().push_slice(text); + true + } + _ => false, + } +} + +fn remove_from_parent(target: &Handle) { + if let Some((parent, i)) = get_parent_and_index(target) { + parent.children.borrow_mut().remove(i); + target.parent.set(None); + } +} + +/// The DOM itself; the result of parsing. +pub struct RcDom { + /// The `Document` itself. + pub document: Handle, + + /// Errors that occurred during parsing. + pub errors: Vec>, + + /// The document's quirks mode. + pub quirks_mode: QuirksMode, +} + +impl TreeSink for RcDom { + type Output = Self; + fn finish(self) -> Self { + self + } + + type Handle = Handle; + + fn parse_error(&mut self, msg: Cow<'static, str>) { + self.errors.push(msg); + } + + fn get_document(&mut self) -> Handle { + self.document.clone() + } + + fn get_template_contents(&mut self, target: &Handle) -> Handle { + if let NodeData::Element { + ref template_contents, + .. + } = target.data + { + template_contents + .borrow() + .as_ref() + .expect("not a template element!") + .clone() + } else { + panic!("not a template element!") + } + } + + fn set_quirks_mode(&mut self, mode: QuirksMode) { + self.quirks_mode = mode; + } + + fn same_node(&self, x: &Handle, y: &Handle) -> bool { + Rc::ptr_eq(x, y) + } + + fn elem_name<'a>(&self, target: &'a Handle) -> ExpandedName<'a> { + return match target.data { + NodeData::Element { ref name, .. } => name.expanded(), + _ => panic!("not an element!"), + }; + } + + fn create_element( + &mut self, + name: QualName, + attrs: Vec, + flags: ElementFlags, + ) -> Handle { + Node::new(NodeData::Element { + name, + attrs: RefCell::new(attrs), + template_contents: RefCell::new(if flags.template { + Some(Node::new(NodeData::Document)) + } else { + None + }), + mathml_annotation_xml_integration_point: flags.mathml_annotation_xml_integration_point, + }) + } + + fn create_comment(&mut self, text: StrTendril) -> Handle { + Node::new(NodeData::Comment { contents: text }) + } + + fn create_pi(&mut self, target: StrTendril, data: StrTendril) -> Handle { + Node::new(NodeData::ProcessingInstruction { + target, + contents: data, + }) + } + + fn append(&mut self, parent: &Handle, child: NodeOrText) { + // Append to an existing Text node if we have one. + if let NodeOrText::AppendText(ref text) = child { + if let Some(h) = parent.children.borrow().last() { + if append_to_existing_text(h, text) { + return; + } + } + } + + append( + parent, + match child { + NodeOrText::AppendText(text) => Node::new(NodeData::Text { + contents: RefCell::new(text), + }), + NodeOrText::AppendNode(node) => node, + }, + ); + } + + fn append_before_sibling(&mut self, sibling: &Handle, child: NodeOrText) { + let (parent, i) = get_parent_and_index(sibling) + .expect("append_before_sibling called on node without parent"); + + let child = match (child, i) { + // No previous node. + (NodeOrText::AppendText(text), 0) => Node::new(NodeData::Text { + contents: RefCell::new(text), + }), + + // Look for a text node before the insertion point. + (NodeOrText::AppendText(text), i) => { + let children = parent.children.borrow(); + let prev = &children[i - 1]; + if append_to_existing_text(prev, &text) { + return; + } + Node::new(NodeData::Text { + contents: RefCell::new(text), + }) + } + + // The tree builder promises we won't have a text node after + // the insertion point. + + // Any other kind of node. + (NodeOrText::AppendNode(node), _) => node, + }; + + remove_from_parent(&child); + + child.parent.set(Some(Rc::downgrade(&parent))); + parent.children.borrow_mut().insert(i, child); + } + + fn append_based_on_parent_node( + &mut self, + element: &Self::Handle, + prev_element: &Self::Handle, + child: NodeOrText, + ) { + let parent = element.parent.take(); + let has_parent = parent.is_some(); + element.parent.set(parent); + + if has_parent { + self.append_before_sibling(element, child); + } else { + self.append(prev_element, child); + } + } + + fn append_doctype_to_document( + &mut self, + name: StrTendril, + public_id: StrTendril, + system_id: StrTendril, + ) { + append( + &self.document, + Node::new(NodeData::Doctype { + name, + public_id, + system_id, + }), + ); + } + + fn add_attrs_if_missing(&mut self, target: &Handle, attrs: Vec) { + let mut existing = if let NodeData::Element { ref attrs, .. } = target.data { + attrs.borrow_mut() + } else { + panic!("not an element") + }; + + let existing_names = existing + .iter() + .map(|e| e.name.clone()) + .collect::>(); + existing.extend( + attrs + .into_iter() + .filter(|attr| !existing_names.contains(&attr.name)), + ); + } + + fn remove_from_parent(&mut self, target: &Handle) { + remove_from_parent(target); + } + + fn reparent_children(&mut self, node: &Handle, new_parent: &Handle) { + let mut children = node.children.borrow_mut(); + let mut new_children = new_parent.children.borrow_mut(); + for child in children.iter() { + let previous_parent = child.parent.replace(Some(Rc::downgrade(new_parent))); + assert!(Rc::ptr_eq( + node, + &previous_parent.unwrap().upgrade().expect("dangling weak") + )) + } + new_children.extend(mem::take(&mut *children)); + } + + fn is_mathml_annotation_xml_integration_point(&self, target: &Handle) -> bool { + if let NodeData::Element { + mathml_annotation_xml_integration_point, + .. + } = target.data + { + mathml_annotation_xml_integration_point + } else { + panic!("not an element!") + } + } +} + +impl Default for RcDom { + fn default() -> RcDom { + RcDom { + document: Node::new(NodeData::Document), + errors: vec![], + quirks_mode: tree_builder::NoQuirks, + } + } +} + +enum SerializeOp { + Open(Handle), + Close(QualName), +} + +pub struct SerializableHandle(Handle); + +impl From for SerializableHandle { + fn from(h: Handle) -> SerializableHandle { + SerializableHandle(h) + } +} + +impl Serialize for SerializableHandle { + fn serialize(&self, serializer: &mut S, traversal_scope: TraversalScope) -> io::Result<()> + where + S: Serializer, + { + let mut ops = VecDeque::new(); + match traversal_scope { + IncludeNode => ops.push_back(SerializeOp::Open(self.0.clone())), + ChildrenOnly(_) => ops.extend( + self.0 + .children + .borrow() + .iter() + .map(|h| SerializeOp::Open(h.clone())), + ), + } + + while let Some(op) = ops.pop_front() { + match op { + SerializeOp::Open(handle) => match handle.data { + NodeData::Element { + ref name, + ref attrs, + .. + } => { + serializer.start_elem( + name.clone(), + attrs.borrow().iter().map(|at| (&at.name, &at.value[..])), + )?; + + ops.reserve(1 + handle.children.borrow().len()); + ops.push_front(SerializeOp::Close(name.clone())); + + for child in handle.children.borrow().iter().rev() { + ops.push_front(SerializeOp::Open(child.clone())); + } + } + + NodeData::Doctype { ref name, .. } => serializer.write_doctype(name)?, + + NodeData::Text { ref contents } => serializer.write_text(&contents.borrow())?, + + NodeData::Comment { ref contents } => serializer.write_comment(contents)?, + + NodeData::ProcessingInstruction { + ref target, + ref contents, + } => serializer.write_processing_instruction(target, contents)?, + + NodeData::Document => panic!("Can't serialize Document node itself"), + }, + + SerializeOp::Close(name) => { + serializer.end_elem(name)?; + } + } + } + + Ok(()) + } +} -- cgit 1.4.1