From 7671f3d883af31cb43fcb8b15056f30cddafd58b Mon Sep 17 00:00:00 2001
From: UVMvmfee <186379858+UVMvmfee@users.noreply.github.com>
Date: Sat, 7 Dec 2024 13:42:07 +0100
Subject: [PATCH] Fixes eager doctype regex matching when doctype is not
 followed by a newline.

---
 tests/unit_tests.py  | 7 +++++++
 trafilatura/utils.py | 2 +-
 2 files changed, 8 insertions(+), 1 deletion(-)
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index b793b074..29887a6f 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -140,6 +140,13 @@ def test_input():
         == '<!DOCTYPE html>\n<html lang="en-US">\n<head/>\n<body/>\n</html>'
     )
 
+    htmlstring = '<!DOCTYPE html><html><head></head><body>Foo <br/> Bar</body></html>'
+    beginning = htmlstring[:50].lower()
+    assert (
+        repair_faulty_html(htmlstring, beginning)
+        == '<!DOCTYPE html><html><head></head><body>Foo <br/> Bar</body></html>\n'
+    )
+
     with pytest.raises(TypeError) as err:
         assert load_html(123) is None
     assert 'incompatible' in str(err.value)
diff --git a/trafilatura/utils.py b/trafilatura/utils.py
index 8cb09793..bd1eee7c 100644
--- a/trafilatura/utils.py
+++ b/trafilatura/utils.py
@@ -61,7 +61,7 @@
 
 UNICODE_ALIASES = {'utf-8', 'utf_8'}
 
-DOCTYPE_TAG = re.compile("^< ?! ?DOCTYPE.+?/ ?>", re.I)
+DOCTYPE_TAG = re.compile(r"^< ?! ?DOCTYPE[^>]*/ ?>", re.I)  # only a self-closed ("/>") doctype; never scans past the first ">"
 FAULTY_HTML = re.compile(r"(<html.*?)\s*/>", re.I)
 HTML_STRIP_TAGS = re.compile(r'(<!--.*?-->|<[^>]*>)')