diff --git a/tests/unit_tests.py b/tests/unit_tests.py index b793b074..29887a6f 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -140,6 +140,13 @@ def test_input(): == '\n\n\n\n' ) + htmlstring = 'Foo
Bar' + beginning = htmlstring[:50].lower() + assert ( + repair_faulty_html(htmlstring, beginning) + == 'Foo
Bar\n' + ) + with pytest.raises(TypeError) as err: assert load_html(123) is None assert 'incompatible' in str(err.value) diff --git a/trafilatura/utils.py b/trafilatura/utils.py index 8cb09793..bd1eee7c 100644 --- a/trafilatura/utils.py +++ b/trafilatura/utils.py @@ -61,7 +61,7 @@ UNICODE_ALIASES = {'utf-8', 'utf_8'} -DOCTYPE_TAG = re.compile("^< ?! ?DOCTYPE.+?/ ?>", re.I) +DOCTYPE_TAG = re.compile("^< ?! ?DOCTYPE[^>]*/[^<]*>", re.I) FAULTY_HTML = re.compile(r"(", re.I) HTML_STRIP_TAGS = re.compile(r'(|<[^>]*>)')