From c758c633f56271ea58c27d3c09736996fa0dab61 Mon Sep 17 00:00:00 2001 From: Dennis Burke Date: Fri, 9 Aug 2024 11:27:55 -0400 Subject: [PATCH] replace html5lib with nh3 We're only using the sanitizer part of html5lib, and it's being deprecated. It seems nh3 is the recommended replacement at this time. This change eliminates a series of Deprecation Warnings. --- pyproject.toml | 2 +- tests/test_textile.py | 2 +- textile/core.py | 11 ++++++----- textile/tools/sanitizer.py | 11 ----------- 4 files changed, 8 insertions(+), 18 deletions(-) delete mode 100644 textile/tools/sanitizer.py diff --git a/pyproject.toml b/pyproject.toml index f8e13efe..5ef4a96a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ classifiers = [ ] dynamic = ["version",] dependencies = [ - 'html5lib>=1.0.1', + 'nh3', 'regex>1.0; implementation_name != "pypy"', ] requires-python = '>=3.8' diff --git a/tests/test_textile.py b/tests/test_textile.py index 8a5e4f8e..a4ec90b7 100644 --- a/tests/test_textile.py +++ b/tests/test_textile.py @@ -117,7 +117,7 @@ def test_sanitize(): assert result == expect test = """

a paragraph of evil text

""" - result = '

a paragraph of evil text

' + result = '

a paragraph of evil text

' expect = textile.Textile().parse(test, sanitize=True) assert result == expect diff --git a/textile/core.py b/textile/core.py index ea94ac18..e9d2955b 100644 --- a/textile/core.py +++ b/textile/core.py @@ -20,8 +20,9 @@ import uuid from urllib.parse import urlparse, urlsplit, urlunsplit, quote, unquote from collections import OrderedDict +from nh3 import clean -from textile.tools import sanitizer, imagesize +from textile.tools import imagesize from textile.regex_strings import (align_re_s, cls_re_s, pnct_re_s, regex_snippets, syms_re_s, table_span_re_s) from textile.utils import (decode_high, encode_high, encode_html, generate_tag, @@ -236,12 +237,12 @@ def parse(self, text, rel=None, sanitize=False): if self.block_tags: if self.lite: - self.blocktag_whitelist = ['bq', 'p'] + self.blocktag_allowlist = set(['bq', 'p', 'br']) text = self.block(text) else: - self.blocktag_whitelist = ['bq', 'p', 'bc', 'notextile', + self.blocktag_allowlist = set(['bq', 'p', 'br', 'bc', 'notextile', 'pre', 'h[1-6]', 'fn{0}+'.format( - regex_snippets['digit']), '###'] + regex_snippets['digit']), '###']) text = self.block(text) text = self.placeNoteLists(text) else: @@ -263,7 +264,7 @@ def parse(self, text, rel=None, sanitize=False): text = text.replace('{0}:glyph:'.format(self.uid), '') if sanitize: - text = sanitizer.sanitize(text) + text = clean(text, tags=self.blocktag_allowlist) text = self.retrieveTags(text) text = self.retrieveURLs(text) diff --git a/textile/tools/sanitizer.py b/textile/tools/sanitizer.py deleted file mode 100644 index 3c7209c6..00000000 --- a/textile/tools/sanitizer.py +++ /dev/null @@ -1,11 +0,0 @@ -def sanitize(string): - """ - Ensure that the text does not contain any malicious HTML code which might - break the page. - """ - from html5lib import parseFragment, serialize - - parsed = parseFragment(string) - clean = serialize(parsed, sanitize=True, omit_optional_tags=False, - quote_attr_values='always') - return clean