diff --git a/tests/unit_tests.py b/tests/unit_tests.py index b43362c8..4ee0be2b 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -354,15 +354,27 @@ def test_formatting(): Here is a code sample: `import trafilatura`""" - my_document = html.fromstring('

Title

Here is a code sample:

import trafilatura\ntrafilatura.extract("")

') + my_document = html.fromstring('

Title

Here is a code sample:

import something
something.run("somewhere")

Sometimes code is wrapped using pre and code:

import trafilatura\ntrafilatura.extract("")

Less often code is wrapped using just pre:

\n    trafilatura.extract("")
') my_result = extract(my_document, output_format='txt', include_formatting=True, config=ZERO_CONFIG) + print(my_result) assert my_result == """### Title Here is a code sample: +``` +import something +something.run("somewhere") +``` +Sometimes code is wrapped using `pre` and `code`: + ``` import trafilatura trafilatura.extract("") +``` +Less often code is wrapped using just `pre`: + +``` +trafilatura.extract("") ```""" # nested diff --git a/trafilatura/htmlprocessing.py b/trafilatura/htmlprocessing.py index 0b8a0b07..f78ffbf4 100644 --- a/trafilatura/htmlprocessing.py +++ b/trafilatura/htmlprocessing.py @@ -44,6 +44,8 @@ PRESERVE_IMG_CLEANING = {"figure", "picture", "source"} +CODE_INDICATORS = ["{", "(\"", "('", "\n "] + def tree_cleaning(tree: HtmlElement, options: Extractor) -> HtmlElement: "Prune the tree by discarding unwanted elements." @@ -315,8 +317,18 @@ def convert_quotes(elem: _Element) -> None: code_flag = True for subelem in code_elems: subelem.attrib.clear() + if _is_code_block(elem.text): + code_flag = True elem.tag = "code" if code_flag else "quote" +def _is_code_block(text: Optional[str]) -> bool: + "Check if the element text is part of a code block." + if not text: + return False + for indicator in CODE_INDICATORS: + if indicator in text: + return True + return False def convert_headings(elem: _Element) -> None: "Add head tags and delete attributes." diff --git a/trafilatura/xml.py b/trafilatura/xml.py index a31e70da..a37fdcd4 100644 --- a/trafilatura/xml.py +++ b/trafilatura/xml.py @@ -34,8 +34,8 @@ CONTROL_PARSER = XMLParser(remove_blank_text=True) -NEWLINE_ELEMS = {'code', 'graphic', 'head', 'lb', 'list', 'p', 'quote', 'row', 'table'} -SPECIAL_FORMATTING = {'del', 'head', 'hi', 'ref'} +NEWLINE_ELEMS = {'graphic', 'head', 'lb', 'list', 'p', 'quote', 'row', 'table'} +SPECIAL_FORMATTING = {'code', 'del', 'head', 'hi', 'ref'} WITH_ATTRIBUTES = {'cell', 'row', 'del', 'graphic', 'head', 'hi', 'item', 'list', 'ref'} NESTING_WHITELIST = {"cell", "figure", "item", "note", "quote"} @@ -251,8 +251,8 @@ def validate_tei(xmldoc: _Element) -> bool: def replace_element_text(element: _Element, include_formatting: bool) -> str: - "Determine element text based on just the text of the element. One must deal with the tail separately." elem_text = element.text or "" + "Determine element text based on just the text of the element. One must deal with the tail separately." # handle formatting: convert to markdown if include_formatting and element.text: if element.tag == "head": @@ -268,8 +268,12 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str: if rend in HI_FORMATTING: elem_text = f"{HI_FORMATTING[rend]}{elem_text}{HI_FORMATTING[rend]}" elif element.tag == "code": - if "\n" in element.text: - elem_text = f"```\n{elem_text}\n```" + if "\n" in elem_text or element.xpath(".//lb"): # Handle
inside + # Convert
to \n within code blocks + for lb in element.xpath(".//lb"): + elem_text = f"{elem_text}\n{lb.tail}" + lb.getparent().remove(lb) + elem_text = f"```\n{elem_text}\n```\n" else: elem_text = f"`{elem_text}`" # handle links