Skip to content

Commit

Permalink
extraction: detect code in pre-tag and fix code formatting (#776)
Browse files Browse the repository at this point in the history
* Ensures formatting of code blocks

* Adds indicators to detect code more effectively

* Avoids unwanted line breaks on inline code

* Avoids spaces after inline code

* Fixes conversion of line breaks in code blocks

* Uses faster approach to detect code

* fix mypy error and add function description

* fix mypy syntax for old versions

---------

Co-authored-by: Adrien Barbaresi <[email protected]>
  • Loading branch information
steineggerroland and adbar authored Feb 7, 2025
1 parent 051bf5f commit fbdffe3
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 6 deletions.
14 changes: 13 additions & 1 deletion tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,15 +354,27 @@ def test_formatting():
Here is a code sample:
`import trafilatura`"""
my_document = html.fromstring('<html><body><article><h3>Title</h3><p>Here is a code sample:</p><code>import trafilatura\ntrafilatura.extract("")</code></p></article></body></html>')
my_document = html.fromstring('<html><body><article><h3>Title</h3><p>Here is a code sample:</p><code><span>import</span> <span>something</span><br/>something.run("somewhere")</code><p>Sometimes code is wrapped using <code>pre</code> and <code>code</code>:</p><pre><code>import trafilatura\ntrafilatura.extract("")</code></pre><p>Less often code is wrapped using just <code>pre</code>:</p><pre>\n trafilatura.extract("")</pre></article></body></html>')
my_result = extract(my_document, output_format='txt', include_formatting=True, config=ZERO_CONFIG)
print(my_result)
assert my_result == """### Title
Here is a code sample:
```
import something
something.run("somewhere")
```
Sometimes code is wrapped using `pre` and `code`:
```
import trafilatura
trafilatura.extract("")
```
Less often code is wrapped using just `pre`:
```
trafilatura.extract("")
```"""

# nested
Expand Down
12 changes: 12 additions & 0 deletions trafilatura/htmlprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@

PRESERVE_IMG_CLEANING = {"figure", "picture", "source"}

# Substrings that _is_code_block() searches for in an element's text; finding
# any one of them marks the text as code. The entries are: an opening brace,
# an opening parenthesis directly followed by a double or single quote, and a
# newline directly followed by a space.
# NOTE(review): presumably these target braces, calls with a string argument,
# and indented continuation lines respectively -- confirm against the tests.
CODE_INDICATORS = ["{", "(\"", "('", "\n "]


def tree_cleaning(tree: HtmlElement, options: Extractor) -> HtmlElement:
"Prune the tree by discarding unwanted elements."
Expand Down Expand Up @@ -315,8 +317,18 @@ def convert_quotes(elem: _Element) -> None:
code_flag = True
for subelem in code_elems:
subelem.attrib.clear()
if _is_code_block(elem.text):
code_flag = True
elem.tag = "code" if code_flag else "quote"

def _is_code_block(text: Optional[str]) -> bool:
"Check if the element text is part of a code block."
if not text:
return False
for indicator in CODE_INDICATORS:
if indicator in text:
return True
return False

def convert_headings(elem: _Element) -> None:
"Add head tags and delete attributes."
Expand Down
14 changes: 9 additions & 5 deletions trafilatura/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@

CONTROL_PARSER = XMLParser(remove_blank_text=True)

NEWLINE_ELEMS = {'code', 'graphic', 'head', 'lb', 'list', 'p', 'quote', 'row', 'table'}
SPECIAL_FORMATTING = {'del', 'head', 'hi', 'ref'}
NEWLINE_ELEMS = {'graphic', 'head', 'lb', 'list', 'p', 'quote', 'row', 'table'}
SPECIAL_FORMATTING = {'code', 'del', 'head', 'hi', 'ref'}
WITH_ATTRIBUTES = {'cell', 'row', 'del', 'graphic', 'head', 'hi', 'item', 'list', 'ref'}
NESTING_WHITELIST = {"cell", "figure", "item", "note", "quote"}

Expand Down Expand Up @@ -251,8 +251,8 @@ def validate_tei(xmldoc: _Element) -> bool:


def replace_element_text(element: _Element, include_formatting: bool) -> str:
"Determine element text based on just the text of the element. One must deal with the tail separately."
elem_text = element.text or ""
"Determine element text based on just the text of the element. One must deal with the tail separately."
# handle formatting: convert to markdown
if include_formatting and element.text:
if element.tag == "head":
Expand All @@ -268,8 +268,12 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:
if rend in HI_FORMATTING:
elem_text = f"{HI_FORMATTING[rend]}{elem_text}{HI_FORMATTING[rend]}"
elif element.tag == "code":
if "\n" in element.text:
elem_text = f"```\n{elem_text}\n```"
if "\n" in elem_text or element.xpath(".//lb"): # Handle <br> inside <code>
# Convert <br> to \n within code blocks
for lb in element.xpath(".//lb"):
elem_text = f"{elem_text}\n{lb.tail}"
lb.getparent().remove(lb)
elem_text = f"```\n{elem_text}\n```\n"
else:
elem_text = f"`{elem_text}`"
# handle links
Expand Down

0 comments on commit fbdffe3

Please sign in to comment.