From c758c633f56271ea58c27d3c09736996fa0dab61 Mon Sep 17 00:00:00 2001
From: Dennis Burke <dennisburke@prodigy.net>
Date: Fri, 9 Aug 2024 11:27:55 -0400
Subject: [PATCH] replace html5lib with nh3

We only use the sanitizer part of html5lib, and that sanitizer is
deprecated. nh3 appears to be the recommended replacement at this time.
This change eliminates a series of DeprecationWarnings.
---
 pyproject.toml             |  2 +-
 tests/test_textile.py      |  2 +-
 textile/core.py            | 11 ++++++-----
 textile/tools/sanitizer.py | 11 -----------
 4 files changed, 8 insertions(+), 18 deletions(-)
 delete mode 100644 textile/tools/sanitizer.py
diff --git a/pyproject.toml b/pyproject.toml
index f8e13efe..5ef4a96a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,7 +26,7 @@ classifiers = [
 ]
 dynamic = ["version",]
 dependencies = [
-    'html5lib>=1.0.1',
+    'nh3',
     'regex>1.0; implementation_name != "pypy"',
 ]
 requires-python = '>=3.8'
diff --git a/tests/test_textile.py b/tests/test_textile.py
index 8a5e4f8e..a4ec90b7 100644
--- a/tests/test_textile.py
+++ b/tests/test_textile.py
@@ -117,7 +117,7 @@ def test_sanitize():
     assert result == expect
 
     test = """<p style="width: expression(alert('evil'));">a paragraph of evil text</p>"""
-    result = '<p style="">a paragraph of evil text</p>'
+    result = '<p>a paragraph of evil text</p>'
     expect = textile.Textile().parse(test, sanitize=True)
     assert result == expect
 
diff --git a/textile/core.py b/textile/core.py
index ea94ac18..e9d2955b 100644
--- a/textile/core.py
+++ b/textile/core.py
@@ -20,8 +20,9 @@
 import uuid
 from urllib.parse import urlparse, urlsplit, urlunsplit, quote, unquote
 from collections import OrderedDict
+from nh3 import clean
 
-from textile.tools import sanitizer, imagesize
+from textile.tools import imagesize
 from textile.regex_strings import (align_re_s, cls_re_s, pnct_re_s,
                                    regex_snippets, syms_re_s, table_span_re_s)
 from textile.utils import (decode_high, encode_high, encode_html, generate_tag,
@@ -236,12 +237,12 @@ def parse(self, text, rel=None, sanitize=False):
 
         if self.block_tags:
             if self.lite:
-                self.blocktag_whitelist = ['bq', 'p']
+                self.blocktag_allowlist = set(['bq', 'p', 'br'])
                 text = self.block(text)
             else:
-                self.blocktag_whitelist = ['bq', 'p', 'bc', 'notextile',
+                self.blocktag_allowlist = set(['bq', 'p', 'br', 'bc', 'notextile',
                                            'pre', 'h[1-6]', 'fn{0}+'.format(
-                                               regex_snippets['digit']), '###']
+                                               regex_snippets['digit']), '###'])
                 text = self.block(text)
                 text = self.placeNoteLists(text)
         else:
@@ -263,7 +264,7 @@ def parse(self, text, rel=None, sanitize=False):
         text = text.replace('{0}:glyph:'.format(self.uid), '')
 
         if sanitize:
-            text = sanitizer.sanitize(text)
+            text = clean(text, tags=self.blocktag_allowlist)
 
         text = self.retrieveTags(text)
         text = self.retrieveURLs(text)
diff --git a/textile/tools/sanitizer.py b/textile/tools/sanitizer.py
deleted file mode 100644
index 3c7209c6..00000000
--- a/textile/tools/sanitizer.py
+++ /dev/null
@@ -1,11 +0,0 @@
-def sanitize(string):
-    """
-    Ensure that the text does not contain any malicious HTML code which might
-    break the page.
-    """
-    from html5lib import parseFragment, serialize
-
-    parsed = parseFragment(string)
-    clean = serialize(parsed, sanitize=True, omit_optional_tags=False,
-                      quote_attr_values='always')
-    return clean