about summary refs log tree commit diff
path: root/src/webmentions
diff options
context:
space:
mode:
Diffstat (limited to 'src/webmentions')
-rw-r--r--src/webmentions/check.rs23
-rw-r--r--src/webmentions/check/rcdom.rs515
2 files changed, 527 insertions, 11 deletions
diff --git a/src/webmentions/check.rs b/src/webmentions/check.rs
index 6dc6a25..178c008 100644
--- a/src/webmentions/check.rs
+++ b/src/webmentions/check.rs
@@ -1,7 +1,11 @@
-use std::{cell::RefCell, rc::Rc};
-use microformats::{types::PropertyValue, html5ever::{self, tendril::TendrilSink}};
+use std::rc::Rc;
+use microformats::types::PropertyValue;
+use html5ever::{self, tendril::TendrilSink};
 use kittybox_util::MentionType;
 
+// TODO: replace.
+mod rcdom;
+
 #[derive(thiserror::Error, Debug)]
 pub enum Error {
     #[error("microformats error: {0}")]
@@ -19,19 +23,16 @@ pub fn check_mention(document: impl AsRef<str> + std::fmt::Debug, base_url: &url
     let document = microformats::from_html(document.as_ref(), base_url.clone())?;
 
     // Get an iterator of all items
-    let items_iter = document.items.iter()
-        .map(AsRef::as_ref)
-        .map(RefCell::borrow);
+    let items_iter = document.items.iter();
 
     for item in items_iter {
         tracing::debug!("Processing item: {:?}", item);
 
-        let props = item.properties.borrow();
         for (prop, interaction_type) in [
             ("in-reply-to", MentionType::Reply), ("like-of", MentionType::Like),
             ("bookmark-of", MentionType::Bookmark), ("repost-of", MentionType::Repost)
         ] {
-            if let Some(propvals) = props.get(prop) {
+            if let Some(propvals) = item.properties.get(prop) {
                 tracing::debug!("Has a u-{} property", prop);
                 for val in propvals {
                     if let PropertyValue::Url(url) = val {
@@ -45,13 +46,13 @@ pub fn check_mention(document: impl AsRef<str> + std::fmt::Debug, base_url: &url
         }
         // Process `content`
         tracing::debug!("Processing e-content...");
-        if let Some(PropertyValue::Fragment(content)) = props.get("content")
+        if let Some(PropertyValue::Fragment(content)) = item.properties.get("content")
             .map(Vec::as_slice)
             .unwrap_or_default()
             .first()
         {
             tracing::debug!("Parsing HTML data...");
-            let root = html5ever::parse_document(html5ever::rcdom::RcDom::default(), Default::default())
+            let root = html5ever::parse_document(rcdom::RcDom::default(), Default::default())
                 .from_utf8()
                 .one(content.html.to_owned().as_bytes())
                 .document;
@@ -64,7 +65,7 @@ pub fn check_mention(document: impl AsRef<str> + std::fmt::Debug, base_url: &url
             // iteration of the loop.
             //
             // Empty list means all nodes were processed.
-            let mut unprocessed_nodes: Vec<Rc<html5ever::rcdom::Node>> = root.children.borrow().iter().cloned().collect();
+            let mut unprocessed_nodes: Vec<Rc<rcdom::Node>> = root.children.borrow().iter().cloned().collect();
             while !unprocessed_nodes.is_empty() {
                 // "Take" the list out of its memory slot, replace it with an empty list
                 let nodes = std::mem::take(&mut unprocessed_nodes);
@@ -73,7 +74,7 @@ pub fn check_mention(document: impl AsRef<str> + std::fmt::Debug, base_url: &url
                     // Add children nodes to the list for the next iteration
                     unprocessed_nodes.extend(node.children.borrow().iter().cloned());
 
-                    if let html5ever::rcdom::NodeData::Element { ref name, ref attrs, .. } = node.data {
+                    if let rcdom::NodeData::Element { ref name, ref attrs, .. } = node.data {
                         // If it's not `<a>`, skip it
                         if name.local != *"a" { continue; }
                         let mut is_mention: bool = false;
diff --git a/src/webmentions/check/rcdom.rs b/src/webmentions/check/rcdom.rs
new file mode 100644
index 0000000..549610f
--- /dev/null
+++ b/src/webmentions/check/rcdom.rs
@@ -0,0 +1,515 @@
+// Copyright 2014-2017 The html5ever Project Developers.
+// Copyright Michael Howell and others.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+#![allow(missing_docs)]
+
+//! A simple reference-counted DOM.
+//!
+//! This is sufficient as a static parse tree, but don't build a
+//! web browser using it. :)
+//!
+//! A DOM is a [tree structure] with ordered children that can be represented in an XML-like
+//! format. For example, the following graph
+//!
+//! ```text
+//! div
+//!  +- "text node"
+//!  +- span
+//! ```
+//! in HTML would be serialized as
+//!
+//! ```html
+//! <div>text node<span></span></div>
+//! ```
+//!
+//! See the [document object model article on wikipedia][dom wiki] for more information.
+//!
+//! This implementation stores the information associated with each node once, and then hands out
+//! refs to children. The nodes themselves are reference-counted to avoid copying - you can create
+//! a new ref and then a node will outlive the document. Nodes own their children, but only have
+//! weak references to their parents.
+//!
+//! [tree structure]: https://en.wikipedia.org/wiki/Tree_(data_structure)
+//! [dom wiki]: https://en.wikipedia.org/wiki/Document_Object_Model
+
+use std::borrow::Cow;
+use std::cell::{Cell, RefCell};
+use std::collections::{HashSet, VecDeque};
+use std::default::Default;
+use std::fmt;
+use std::io;
+use std::mem;
+use std::rc::{Rc, Weak};
+
+use html5ever::tendril::StrTendril;
+
+use html5ever::interface::tree_builder;
+use html5ever::interface::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink};
+use html5ever::serialize::TraversalScope;
+use html5ever::serialize::TraversalScope::{ChildrenOnly, IncludeNode};
+use html5ever::serialize::{Serialize, Serializer};
+use html5ever::Attribute;
+use html5ever::ExpandedName;
+use html5ever::QualName;
+
+/// The different kinds of nodes in the DOM.
+#[derive(Debug)]
+pub enum NodeData {
+    /// The `Document` itself - the root node of a HTML document.
+    Document,
+
+    /// A `DOCTYPE` with name, public id, and system id. See
+    /// [document type declaration on wikipedia][dtd wiki].
+    ///
+    /// [dtd wiki]: https://en.wikipedia.org/wiki/Document_type_declaration
+    Doctype {
+        name: StrTendril,
+        public_id: StrTendril,
+        system_id: StrTendril,
+    },
+
+    /// A text node.
+    Text { contents: RefCell<StrTendril> },
+
+    /// A comment.
+    Comment { contents: StrTendril },
+
+    /// An element with attributes.
+    Element {
+        name: QualName,
+        attrs: RefCell<Vec<Attribute>>,
+
+        /// For HTML \<template\> elements, the [template contents].
+        ///
+        /// [template contents]: https://html.spec.whatwg.org/multipage/#template-contents
+        template_contents: RefCell<Option<Handle>>,
+
+        /// Whether the node is a [HTML integration point].
+        ///
+        /// [HTML integration point]: https://html.spec.whatwg.org/multipage/#html-integration-point
+        mathml_annotation_xml_integration_point: bool,
+    },
+
+    /// A Processing instruction.
+    ProcessingInstruction {
+        target: StrTendril,
+        contents: StrTendril,
+    },
+}
+
+/// A DOM node.
+pub struct Node {
+    /// Parent node.
+    pub parent: Cell<Option<WeakHandle>>,
+    /// Child nodes of this node.
+    pub children: RefCell<Vec<Handle>>,
+    /// Represents this node's data.
+    pub data: NodeData,
+}
+
+impl Node {
+    /// Create a new node from its contents
+    pub fn new(data: NodeData) -> Rc<Self> {
+        Rc::new(Node {
+            data,
+            parent: Cell::new(None),
+            children: RefCell::new(Vec::new()),
+        })
+    }
+}
+
+impl Drop for Node {
+    fn drop(&mut self) {
+        let mut nodes = mem::take(&mut *self.children.borrow_mut());
+        while let Some(node) = nodes.pop() {
+            let children = mem::take(&mut *node.children.borrow_mut());
+            nodes.extend(children.into_iter());
+            if let NodeData::Element {
+                ref template_contents,
+                ..
+            } = node.data
+            {
+                if let Some(template_contents) = template_contents.borrow_mut().take() {
+                    nodes.push(template_contents);
+                }
+            }
+        }
+    }
+}
+
+impl fmt::Debug for Node {
+    fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
+        fmt.debug_struct("Node")
+            .field("data", &self.data)
+            .field("children", &self.children)
+            .finish()
+    }
+}
+
+/// Reference to a DOM node.
+pub type Handle = Rc<Node>;
+
+/// Weak reference to a DOM node, used for parent pointers.
+pub type WeakHandle = Weak<Node>;
+
+/// Append a parentless node to another nodes' children
+fn append(new_parent: &Handle, child: Handle) {
+    let previous_parent = child.parent.replace(Some(Rc::downgrade(new_parent)));
+    // Invariant: child cannot have existing parent
+    assert!(previous_parent.is_none());
+    new_parent.children.borrow_mut().push(child);
+}
+
+/// If the node has a parent, get it and this node's position in its children
+fn get_parent_and_index(target: &Handle) -> Option<(Handle, usize)> {
+    if let Some(weak) = target.parent.take() {
+        let parent = weak.upgrade().expect("dangling weak pointer");
+        target.parent.set(Some(weak));
+        let i = match parent
+            .children
+            .borrow()
+            .iter()
+            .enumerate()
+            .find(|&(_, child)| Rc::ptr_eq(child, target))
+        {
+            Some((i, _)) => i,
+            None => panic!("have parent but couldn't find in parent's children!"),
+        };
+        Some((parent, i))
+    } else {
+        None
+    }
+}
+
+fn append_to_existing_text(prev: &Handle, text: &str) -> bool {
+    match prev.data {
+        NodeData::Text { ref contents } => {
+            contents.borrow_mut().push_slice(text);
+            true
+        }
+        _ => false,
+    }
+}
+
+fn remove_from_parent(target: &Handle) {
+    if let Some((parent, i)) = get_parent_and_index(target) {
+        parent.children.borrow_mut().remove(i);
+        target.parent.set(None);
+    }
+}
+
+/// The DOM itself; the result of parsing.
+pub struct RcDom {
+    /// The `Document` itself.
+    pub document: Handle,
+
+    /// Errors that occurred during parsing.
+    pub errors: Vec<Cow<'static, str>>,
+
+    /// The document's quirks mode.
+    pub quirks_mode: QuirksMode,
+}
+
+impl TreeSink for RcDom {
+    type Output = Self;
+    fn finish(self) -> Self {
+        self
+    }
+
+    type Handle = Handle;
+
+    fn parse_error(&mut self, msg: Cow<'static, str>) {
+        self.errors.push(msg);
+    }
+
+    fn get_document(&mut self) -> Handle {
+        self.document.clone()
+    }
+
+    fn get_template_contents(&mut self, target: &Handle) -> Handle {
+        if let NodeData::Element {
+            ref template_contents,
+            ..
+        } = target.data
+        {
+            template_contents
+                .borrow()
+                .as_ref()
+                .expect("not a template element!")
+                .clone()
+        } else {
+            panic!("not a template element!")
+        }
+    }
+
+    fn set_quirks_mode(&mut self, mode: QuirksMode) {
+        self.quirks_mode = mode;
+    }
+
+    fn same_node(&self, x: &Handle, y: &Handle) -> bool {
+        Rc::ptr_eq(x, y)
+    }
+
+    fn elem_name<'a>(&self, target: &'a Handle) -> ExpandedName<'a> {
+        return match target.data {
+            NodeData::Element { ref name, .. } => name.expanded(),
+            _ => panic!("not an element!"),
+        };
+    }
+
+    fn create_element(
+        &mut self,
+        name: QualName,
+        attrs: Vec<Attribute>,
+        flags: ElementFlags,
+    ) -> Handle {
+        Node::new(NodeData::Element {
+            name,
+            attrs: RefCell::new(attrs),
+            template_contents: RefCell::new(if flags.template {
+                Some(Node::new(NodeData::Document))
+            } else {
+                None
+            }),
+            mathml_annotation_xml_integration_point: flags.mathml_annotation_xml_integration_point,
+        })
+    }
+
+    fn create_comment(&mut self, text: StrTendril) -> Handle {
+        Node::new(NodeData::Comment { contents: text })
+    }
+
+    fn create_pi(&mut self, target: StrTendril, data: StrTendril) -> Handle {
+        Node::new(NodeData::ProcessingInstruction {
+            target,
+            contents: data,
+        })
+    }
+
+    fn append(&mut self, parent: &Handle, child: NodeOrText<Handle>) {
+        // Append to an existing Text node if we have one.
+        if let NodeOrText::AppendText(ref text) = child {
+            if let Some(h) = parent.children.borrow().last() {
+                if append_to_existing_text(h, text) {
+                    return;
+                }
+            }
+        }
+
+        append(
+            parent,
+            match child {
+                NodeOrText::AppendText(text) => Node::new(NodeData::Text {
+                    contents: RefCell::new(text),
+                }),
+                NodeOrText::AppendNode(node) => node,
+            },
+        );
+    }
+
+    fn append_before_sibling(&mut self, sibling: &Handle, child: NodeOrText<Handle>) {
+        let (parent, i) = get_parent_and_index(sibling)
+            .expect("append_before_sibling called on node without parent");
+
+        let child = match (child, i) {
+            // No previous node.
+            (NodeOrText::AppendText(text), 0) => Node::new(NodeData::Text {
+                contents: RefCell::new(text),
+            }),
+
+            // Look for a text node before the insertion point.
+            (NodeOrText::AppendText(text), i) => {
+                let children = parent.children.borrow();
+                let prev = &children[i - 1];
+                if append_to_existing_text(prev, &text) {
+                    return;
+                }
+                Node::new(NodeData::Text {
+                    contents: RefCell::new(text),
+                })
+            }
+
+            // The tree builder promises we won't have a text node after
+            // the insertion point.
+
+            // Any other kind of node.
+            (NodeOrText::AppendNode(node), _) => node,
+        };
+
+        remove_from_parent(&child);
+
+        child.parent.set(Some(Rc::downgrade(&parent)));
+        parent.children.borrow_mut().insert(i, child);
+    }
+
+    fn append_based_on_parent_node(
+        &mut self,
+        element: &Self::Handle,
+        prev_element: &Self::Handle,
+        child: NodeOrText<Self::Handle>,
+    ) {
+        let parent = element.parent.take();
+        let has_parent = parent.is_some();
+        element.parent.set(parent);
+
+        if has_parent {
+            self.append_before_sibling(element, child);
+        } else {
+            self.append(prev_element, child);
+        }
+    }
+
+    fn append_doctype_to_document(
+        &mut self,
+        name: StrTendril,
+        public_id: StrTendril,
+        system_id: StrTendril,
+    ) {
+        append(
+            &self.document,
+            Node::new(NodeData::Doctype {
+                name,
+                public_id,
+                system_id,
+            }),
+        );
+    }
+
+    fn add_attrs_if_missing(&mut self, target: &Handle, attrs: Vec<Attribute>) {
+        let mut existing = if let NodeData::Element { ref attrs, .. } = target.data {
+            attrs.borrow_mut()
+        } else {
+            panic!("not an element")
+        };
+
+        let existing_names = existing
+            .iter()
+            .map(|e| e.name.clone())
+            .collect::<HashSet<_>>();
+        existing.extend(
+            attrs
+                .into_iter()
+                .filter(|attr| !existing_names.contains(&attr.name)),
+        );
+    }
+
+    fn remove_from_parent(&mut self, target: &Handle) {
+        remove_from_parent(target);
+    }
+
+    fn reparent_children(&mut self, node: &Handle, new_parent: &Handle) {
+        let mut children = node.children.borrow_mut();
+        let mut new_children = new_parent.children.borrow_mut();
+        for child in children.iter() {
+            let previous_parent = child.parent.replace(Some(Rc::downgrade(new_parent)));
+            assert!(Rc::ptr_eq(
+                node,
+                &previous_parent.unwrap().upgrade().expect("dangling weak")
+            ))
+        }
+        new_children.extend(mem::take(&mut *children));
+    }
+
+    fn is_mathml_annotation_xml_integration_point(&self, target: &Handle) -> bool {
+        if let NodeData::Element {
+            mathml_annotation_xml_integration_point,
+            ..
+        } = target.data
+        {
+            mathml_annotation_xml_integration_point
+        } else {
+            panic!("not an element!")
+        }
+    }
+}
+
+impl Default for RcDom {
+    fn default() -> RcDom {
+        RcDom {
+            document: Node::new(NodeData::Document),
+            errors: vec![],
+            quirks_mode: tree_builder::NoQuirks,
+        }
+    }
+}
+
+enum SerializeOp {
+    Open(Handle),
+    Close(QualName),
+}
+
+pub struct SerializableHandle(Handle);
+
+impl From<Handle> for SerializableHandle {
+    fn from(h: Handle) -> SerializableHandle {
+        SerializableHandle(h)
+    }
+}
+
+impl Serialize for SerializableHandle {
+    fn serialize<S>(&self, serializer: &mut S, traversal_scope: TraversalScope) -> io::Result<()>
+    where
+        S: Serializer,
+    {
+        let mut ops = VecDeque::new();
+        match traversal_scope {
+            IncludeNode => ops.push_back(SerializeOp::Open(self.0.clone())),
+            ChildrenOnly(_) => ops.extend(
+                self.0
+                    .children
+                    .borrow()
+                    .iter()
+                    .map(|h| SerializeOp::Open(h.clone())),
+            ),
+        }
+
+        while let Some(op) = ops.pop_front() {
+            match op {
+                SerializeOp::Open(handle) => match handle.data {
+                    NodeData::Element {
+                        ref name,
+                        ref attrs,
+                        ..
+                    } => {
+                        serializer.start_elem(
+                            name.clone(),
+                            attrs.borrow().iter().map(|at| (&at.name, &at.value[..])),
+                        )?;
+
+                        ops.reserve(1 + handle.children.borrow().len());
+                        ops.push_front(SerializeOp::Close(name.clone()));
+
+                        for child in handle.children.borrow().iter().rev() {
+                            ops.push_front(SerializeOp::Open(child.clone()));
+                        }
+                    }
+
+                    NodeData::Doctype { ref name, .. } => serializer.write_doctype(name)?,
+
+                    NodeData::Text { ref contents } => serializer.write_text(&contents.borrow())?,
+
+                    NodeData::Comment { ref contents } => serializer.write_comment(contents)?,
+
+                    NodeData::ProcessingInstruction {
+                        ref target,
+                        ref contents,
+                    } => serializer.write_processing_instruction(target, contents)?,
+
+                    NodeData::Document => panic!("Can't serialize Document node itself"),
+                },
+
+                SerializeOp::Close(name) => {
+                    serializer.end_elem(name)?;
+                }
+            }
+        }
+
+        Ok(())
+    }
+}