extraction: detect code in pre-tag and fix code formatting (#776)

* Ensures formatting of code blocks * Adds indicators to detect code more effectively * Avoids unwanted line breaks on inline code * Avoids spaces after inline code * Fixes conversion of line breaks in code blocks * Uses faster approach to detect code * fix mypy error and add function description * fix mypy syntax for old versions --------- Co-authored-by: Adrien Barbaresi <[email protected]>
adbar · Feb 7, 2025 · fbdffe3 · fbdffe3
1 parent 051bf5f
commit fbdffe3
Show file tree

Hide file tree

Showing 3 changed files with 34 additions and 6 deletions.
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
@@ -354,15 +354,27 @@ def test_formatting():
 Here is a code sample:
 
 `import trafilatura`"""
-    my_document = html.fromstring('<html><body><article><h3>Title</h3><p>Here is a code sample:</p><code>import trafilatura\ntrafilatura.extract("")</code></p></article></body></html>')
+    my_document = html.fromstring('<html><body><article><h3>Title</h3><p>Here is a code sample:</p><code><span>import</span> <span>something</span><br/>something.run("somewhere")</code><p>Sometimes code is wrapped using <code>pre</code> and <code>code</code>:</p><pre><code>import trafilatura\ntrafilatura.extract("")</code></pre><p>Less often code is wrapped using just <code>pre</code>:</p><pre>\n    trafilatura.extract("")</pre></article></body></html>')
     my_result = extract(my_document, output_format='txt', include_formatting=True, config=ZERO_CONFIG)
+    print(my_result)
     assert my_result == """### Title
 
 Here is a code sample:
 
+```
+import something
+something.run("somewhere")
+```
+Sometimes code is wrapped using `pre` and `code`:
+
 ```
 import trafilatura
 trafilatura.extract("")
+```
+Less often code is wrapped using just `pre`:
+
+```
+trafilatura.extract("")
 ```"""
 
     # nested

diff --git a/trafilatura/htmlprocessing.py b/trafilatura/htmlprocessing.py
@@ -44,6 +44,8 @@
 
 PRESERVE_IMG_CLEANING = {"figure", "picture", "source"}
 
+CODE_INDICATORS = ["{", "(\"", "('", "\n    "]
+
 
 def tree_cleaning(tree: HtmlElement, options: Extractor) -> HtmlElement:
     "Prune the tree by discarding unwanted elements."
@@ -315,8 +317,18 @@ def convert_quotes(elem: _Element) -> None:
             code_flag = True
             for subelem in code_elems:
                 subelem.attrib.clear()
+        if _is_code_block(elem.text):
+            code_flag = True
     elem.tag = "code" if code_flag else "quote"
 
+def _is_code_block(text: Optional[str]) -> bool:
+    "Check if the element text is part of a code block."
+    if not text:
+        return False
+    for indicator in CODE_INDICATORS:
+        if indicator in text:
+            return True
+    return False
 
 def convert_headings(elem: _Element) -> None:
     "Add head tags and delete attributes."

diff --git a/trafilatura/xml.py b/trafilatura/xml.py
@@ -34,8 +34,8 @@
 
 CONTROL_PARSER = XMLParser(remove_blank_text=True)
 
-NEWLINE_ELEMS = {'code', 'graphic', 'head', 'lb', 'list', 'p', 'quote', 'row', 'table'}
-SPECIAL_FORMATTING = {'del', 'head', 'hi', 'ref'}
+NEWLINE_ELEMS = {'graphic', 'head', 'lb', 'list', 'p', 'quote', 'row', 'table'}
+SPECIAL_FORMATTING = {'code', 'del', 'head', 'hi', 'ref'}
 WITH_ATTRIBUTES = {'cell', 'row', 'del', 'graphic', 'head', 'hi', 'item', 'list', 'ref'}
 NESTING_WHITELIST = {"cell", "figure", "item", "note", "quote"}
 
@@ -251,8 +251,8 @@ def validate_tei(xmldoc: _Element) -> bool:
 
 
 def replace_element_text(element: _Element, include_formatting: bool) -> str:
-    "Determine element text based on just the text of the element. One must deal with the tail separately."
     elem_text = element.text or ""
+    "Determine element text based on just the text of the element. One must deal with the tail separately."
     # handle formatting: convert to markdown
     if include_formatting and element.text:
         if element.tag == "head":
@@ -268,8 +268,12 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:
             if rend in HI_FORMATTING:
                 elem_text = f"{HI_FORMATTING[rend]}{elem_text}{HI_FORMATTING[rend]}"
         elif element.tag == "code":
-            if "\n" in element.text:
-                elem_text = f"```\n{elem_text}\n```"
+            if "\n" in elem_text or element.xpath(".//lb"):  # Handle <br> inside <code>
+                # Convert <br> to \n within code blocks
+                for lb in element.xpath(".//lb"):
+                    elem_text = f"{elem_text}\n{lb.tail}"
+                    lb.getparent().remove(lb)
+                elem_text = f"```\n{elem_text}\n```\n"
             else:
                 elem_text = f"`{elem_text}`"
     # handle links