diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index b43362c8..4ee0be2b 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -354,15 +354,27 @@ def test_formatting():
Here is a code sample:
`import trafilatura`"""
- my_document = html.fromstring('
Title
Here is a code sample:
import trafilatura\ntrafilatura.extract("")
')
+ my_document = html.fromstring('Title
Here is a code sample:
import something
something.run("somewhere")
Sometimes code is wrapped using pre
and code
:
import trafilatura\ntrafilatura.extract("")
Less often code is wrapped using just pre
:
\n trafilatura.extract("")
')
my_result = extract(my_document, output_format='txt', include_formatting=True, config=ZERO_CONFIG)
+ print(my_result)
assert my_result == """### Title
Here is a code sample:
+```
+import something
+something.run("somewhere")
+```
+Sometimes code is wrapped using `pre` and `code`:
+
```
import trafilatura
trafilatura.extract("")
+```
+Less often code is wrapped using just `pre`:
+
+```
+trafilatura.extract("")
```"""
# nested
diff --git a/trafilatura/htmlprocessing.py b/trafilatura/htmlprocessing.py
index 0b8a0b07..f78ffbf4 100644
--- a/trafilatura/htmlprocessing.py
+++ b/trafilatura/htmlprocessing.py
@@ -44,6 +44,8 @@
PRESERVE_IMG_CLEANING = {"figure", "picture", "source"}
+CODE_INDICATORS = ["{", "(\"", "('", "\n "]  # substrings whose presence makes _is_code_block() treat a text as code
+
def tree_cleaning(tree: HtmlElement, options: Extractor) -> HtmlElement:
"Prune the tree by discarding unwanted elements."
@@ -315,8 +317,18 @@ def convert_quotes(elem: _Element) -> None:
code_flag = True
for subelem in code_elems:
subelem.attrib.clear()
+ if _is_code_block(elem.text):
+ code_flag = True
elem.tag = "code" if code_flag else "quote"
+def _is_code_block(text: Optional[str]) -> bool:
+    """Tell whether an element's text looks like source code.
+
+    Missing or empty text never counts; one CODE_INDICATORS substring is enough.
+    """
+    if not text:
+        return False
+    return any(marker in text for marker in CODE_INDICATORS)
def convert_headings(elem: _Element) -> None:
"Add head tags and delete attributes."
diff --git a/trafilatura/xml.py b/trafilatura/xml.py
index a31e70da..a37fdcd4 100644
--- a/trafilatura/xml.py
+++ b/trafilatura/xml.py
@@ -34,8 +34,8 @@
CONTROL_PARSER = XMLParser(remove_blank_text=True)
-NEWLINE_ELEMS = {'code', 'graphic', 'head', 'lb', 'list', 'p', 'quote', 'row', 'table'}
-SPECIAL_FORMATTING = {'del', 'head', 'hi', 'ref'}
+NEWLINE_ELEMS = {'graphic', 'head', 'lb', 'list', 'p', 'quote', 'row', 'table'}
+SPECIAL_FORMATTING = {'code', 'del', 'head', 'hi', 'ref'}
WITH_ATTRIBUTES = {'cell', 'row', 'del', 'graphic', 'head', 'hi', 'item', 'list', 'ref'}
NESTING_WHITELIST = {"cell", "figure", "item", "note", "quote"}
@@ -251,8 +251,8 @@ def validate_tei(xmldoc: _Element) -> bool:
 def replace_element_text(element: _Element, include_formatting: bool) -> str:
-    "Determine element text based on just the text of the element. One must deal with the tail separately."
+    """Determine element text based on just the text of the element. One must deal with the tail separately."""
     elem_text = element.text or ""
     # handle formatting: convert to markdown
     if include_formatting and element.text:
         if element.tag == "head":
@@ -268,8 +268,12 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:
             if rend in HI_FORMATTING:
                 elem_text = f"{HI_FORMATTING[rend]}{elem_text}{HI_FORMATTING[rend]}"
         elif element.tag == "code":
-            if "\n" in element.text:
-                elem_text = f"```\n{elem_text}\n```"
+            line_breaks = element.xpath(".//lb")  # <lb> descendants inside the <code> element
+            if "\n" in elem_text or line_breaks:
+                for lb in line_breaks:  # turn each <lb> into "\n"; a missing tail must not print "None"
+                    elem_text = f"{elem_text}\n{lb.tail or ''}"
+                    lb.getparent().remove(lb)
+                elem_text = f"```\n{elem_text}\n```\n"
             else:
                 elem_text = f"`{elem_text}`"
     # handle links