From 7671f3d883af31cb43fcb8b15056f30cddafd58b Mon Sep 17 00:00:00 2001 From: UVMvmfee <186379858+UVMvmfee@users.noreply.github.com> Date: Sat, 7 Dec 2024 13:42:07 +0100 Subject: [PATCH] Fixes eager doctype regex matching when doctype is not followed by a newline. --- tests/unit_tests.py | 7 +++++++ trafilatura/utils.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/unit_tests.py b/tests/unit_tests.py index b793b074..29887a6f 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -140,6 +140,13 @@ def test_input(): == '\n\n\n\n' ) + htmlstring = 'Foo
Bar' + beginning = htmlstring[:50].lower() + assert ( + repair_faulty_html(htmlstring, beginning) + == 'Foo
Bar\n' + ) + with pytest.raises(TypeError) as err: assert load_html(123) is None assert 'incompatible' in str(err.value) diff --git a/trafilatura/utils.py b/trafilatura/utils.py index 8cb09793..bd1eee7c 100644 --- a/trafilatura/utils.py +++ b/trafilatura/utils.py @@ -61,7 +61,7 @@ UNICODE_ALIASES = {'utf-8', 'utf_8'} -DOCTYPE_TAG = re.compile("^< ?! ?DOCTYPE.+?/ ?>", re.I) +DOCTYPE_TAG = re.compile("^< ?! ?DOCTYPE[^>]*/[^<]*>", re.I) FAULTY_HTML = re.compile(r"(", re.I) HTML_STRIP_TAGS = re.compile(r'(|<[^>]*>)')