Remove beautiful soup dependency (#1453)

* WIP replace beautiful soup with custom HTML util * Format code with black * Remove unused imports * Remove beautifulsoup, bleach and htmlmin * Bump version to 0.29 * Remove BS4 import * Fix function call * Tweak HTML parser * Tweak parser
rafalp · Feb 10, 2023 · 161da7d · 161da7d
1 parent 80b5398
commit 161da7d
Show file tree

Hide file tree

Showing 33 changed files with 711 additions and 367 deletions.
diff --git a/misago/__init__.py b/misago/__init__.py
@@ -1,5 +1,5 @@
 from .plugins.pluginlist import load_plugin_list_if_exists
 
 
-__version__ = "0.28.2"
+__version__ = "0.29.0"
 __released__ = True
diff --git a/misago/conf/defaults.py b/misago/conf/defaults.py
@@ -36,11 +36,6 @@
 MISAGO_MARKUP_EXTENSIONS = []
 
 
-# Bleach callbacks for linkifying paragraphs
-
-MISAGO_BLEACH_CALLBACKS = []
-
-
 # Custom post validators
 
 MISAGO_POST_VALIDATORS = []

diff --git a/misago/markup/htmlparser.py b/misago/markup/htmlparser.py
@@ -0,0 +1,110 @@
+import html
+from dataclasses import dataclass
+
+import html5lib
+
+SINGLETON_TAGS = (
+    "area",
+    "base",
+    "br",
+    "col",
+    "command",
+    "embed",
+    "hr",
+    "img",
+    "input",
+    "keygen",
+    "link",
+    "meta",
+    "param",
+    "source",
+    "track",
+    "wbr",
+)
+
+
+class Node:
+    def __str__(self):
+        raise NotImplementedError("Subclasses of 'Node' need to implement __str__")
+
+
+@dataclass
+class RootNode(Node):
+    tag = None
+    children: list
+
+    def __str__(self):
+        return "".join(str(child) for child in self.children)
+
+
+@dataclass
+class ElementNode(Node):
+    tag: str
+    attrs: dict
+    children: list
+
+    def __str__(self):
+        attrs_padding = " " if self.attrs else ""
+        attrs = " ".join(self.attrs_str())
+
+        if self.tag in SINGLETON_TAGS:
+            return f"<{self.tag}{attrs_padding}{attrs} />"
+
+        children = "".join(str(child) for child in self.children)
+        return f"<{self.tag}{attrs_padding}{attrs}>{children}</{self.tag}>"
+
+    def attrs_str(self):
+        for name, value in self.attrs.items():
+            if value is True or not value:
+                yield html.escape(str(name))
+            else:
+                yield (f'{html.escape(str(name))}="{html.escape(str(value))}"')
+
+
+@dataclass
+class TextNode(Node):
+    text: str
+
+    def __str__(self):
+        return html.escape(self.text)
+
+
+def parse_html_string(string: str) -> RootNode:
+    element = html5lib.parse(
+        string,
+        namespaceHTMLElements=False,
+    )
+
+    body = element.find("body")
+    root_node = RootNode(children=[])
+
+    if body.text:
+        root_node.children.append(TextNode(text=body.text))
+
+    for child in body:
+        add_child_node(root_node, child)
+
+    return root_node
+
+
+def add_child_node(parent, element):
+    node = ElementNode(
+        tag=element.tag,
+        attrs=element.attrib,
+        children=[],
+    )
+
+    if element.text:
+        node.children.append(TextNode(text=element.text))
+
+    parent.children.append(node)
+
+    if element.tail:
+        parent.children.append(TextNode(text=element.tail))
+
+    for child in element:
+        add_child_node(node, child)
+
+
+def print_html_string(root_node: RootNode) -> str:
+    return str(root_node)
diff --git a/misago/markup/links.py b/misago/markup/links.py
@@ -0,0 +1,199 @@
+import re
+from typing import Union
+
+from django.http import Http404
+from django.urls import resolve
+
+from .htmlparser import ElementNode, RootNode, TextNode
+
+MISAGO_ATTACHMENT_VIEWS = ("misago:attachment", "misago:attachment-thumbnail")
+URL_RE = re.compile(
+    r"(https?://)?"
+    r"(www\.)?"
+    r"(\w+((-|_)\w+)?\.)?"
+    r"\w+((_|-|\w)+)?(\.[a-z][a-z]+)"
+    r"(:[1-9][0-9]+)?"
+    r"([^\s<>\[\]\(\);:]+)?"
+)
+
+
+def linkify_texts(node: Union[RootNode, ElementNode]):
+    # Skip link replacement in some nodes
+    if node.tag in ("pre", "code", "a"):
+        return
+
+    new_children = []
+    for child in node.children:
+        if isinstance(child, TextNode):
+            if URL_RE.search(child.text):
+                new_children += replace_links_in_text(child.text)
+            else:
+                new_children.append(child)
+        else:
+            new_children.append(child)
+            linkify_texts(child)
+
+    node.children = new_children
+
+
+def replace_links_in_text(text: str) -> list:
+    nodes = []
+
+    while True:
+        match = URL_RE.search(text)
+        if not match:
+            if text:
+                nodes.append(TextNode(text=text))
+            return nodes
+
+        start, end = match.span()
+        url = text[start:end]
+
+        # Append text between 0 and start to nodes
+        if start > 0:
+            nodes.append(TextNode(text=text[:start]))
+
+        nodes.append(
+            ElementNode(
+                tag="a",
+                attrs={"href": url},
+                children=[
+                    TextNode(text=strip_link_protocol(url)),
+                ],
+            )
+        )
+
+        text = text[end:]
+
+
+def clean_links(
+    request,
+    result,
+    node: Union[RootNode, ElementNode, TextNode],
+    force_shva=False,
+):
+    if isinstance(node, TextNode):
+        return
+
+    for child in node.children:
+        if not isinstance(child, ElementNode):
+            continue
+
+        if child.tag == "a":
+            clean_link_node(request, result, child, force_shva)
+            clean_links(request, result, child, force_shva)
+        elif child.tag == "img":
+            clean_image_node(request, result, child, force_shva)
+        else:
+            clean_links(request, result, child, force_shva)
+
+
+def clean_link_node(
+    request,
+    result: dict,
+    node: ElementNode,
+    force_shva: bool,
+):
+    host = request.get_host()
+    href = node.attrs.get("href") or "/"
+
+    if is_internal_link(href, host):
+        href = clean_internal_link(href, host)
+        result["internal_links"].append(href)
+        href = clean_attachment_link(href, force_shva)
+    else:
+        result["outgoing_links"].append(strip_link_protocol(href))
+        href = assert_link_prefix(href)
+        node.attrs["rel"] = "external nofollow noopener"
+
+    node.attrs["target"] = "_blank"
+    node.attrs["href"] = href
+
+    if len(node.children) == 0:
+        node.children.append(strip_link_protocol(href))
+    elif len(node.children) == 1 and isinstance(node.children[0], TextNode):
+        text = node.children[0].text
+        if URL_RE.match(text):
+            node.children[0].text = strip_link_protocol(text)
+
+
+def clean_image_node(
+    request,
+    result: dict,
+    node: ElementNode,
+    force_shva: bool,
+):
+    host = request.get_host()
+    src = node.attrs.get("src") or "/"
+
+    node.attrs["alt"] = strip_link_protocol(node.attrs["alt"])
+
+    if is_internal_link(src, host):
+        src = clean_internal_link(src, host)
+        result["images"].append(src)
+        src = clean_attachment_link(src, force_shva)
+    else:
+        result["images"].append(strip_link_protocol(src))
+        src = assert_link_prefix(src)
+
+    node.attrs["src"] = src
+
+
+def is_internal_link(link, host):
+    if link.startswith("/") and not link.startswith("//"):
+        return True
+
+    link = strip_link_protocol(link).lstrip("www.").lower()
+    return link.lower().startswith(host.lstrip("www."))
+
+
+def strip_link_protocol(link):
+    if link.lower().startswith("https:"):
+        link = link[6:]
+    if link.lower().startswith("http:"):
+        link = link[5:]
+    if link.startswith("//"):
+        link = link[2:]
+    return link
+
+
+def assert_link_prefix(link):
+    if link.lower().startswith("https:"):
+        return link
+    if link.lower().startswith("http:"):
+        return link
+    if link.startswith("//"):
+        return "http:%s" % link
+
+    return "http://%s" % link
+
+
+def clean_internal_link(link, host):
+    link = strip_link_protocol(link)
+
+    if link.lower().startswith("www."):
+        link = link[4:]
+    if host.lower().startswith("www."):
+        host = host[4:]
+
+    if link.lower().startswith(host):
+        link = link[len(host) :]
+
+    return link or "/"
+
+
+def clean_attachment_link(link, force_shva=False):
+    try:
+        resolution = resolve(link)
+        if not resolution.namespaces:
+            return link
+        url_name = ":".join(resolution.namespaces + [resolution.url_name])
+    except (Http404, ValueError):
+        return link
+
+    if url_name in MISAGO_ATTACHMENT_VIEWS:
+        if force_shva:
+            link = "%s?shva=1" % link
+        elif link.endswith("?shva=1"):
+            link = link[:-7]
+    return link