Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

extraction: improve spacing in item, cell and code blocks #772

Merged
merged 12 commits into from
Feb 17, 2025
51 changes: 42 additions & 9 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,10 +330,10 @@ def test_formatting():
my_result = extract(my_document, output_format='xml', include_formatting=True, config=ZERO_CONFIG)
assert '<hi rend="#b">This here is in bold font.</hi>' in my_result
# titles as markdown
my_string = '<html><body><article><h3>Title</h3><p><b>This here is in bold font.</b></p></article></body></html>'
my_string = '<html><body><article><h3>Title</h3><p><b>This here is in bold font.</b>Non-bold here</p></article></body></html>'
my_document = html.fromstring(my_string)
my_result = extract(my_document, output_format='txt', include_formatting=True, config=ZERO_CONFIG)
assert my_result == '### Title\n\n**This here is in bold font.**'
assert my_result == '### Title\n\n**This here is in bold font.**Non-bold here'
assert extract(my_string, output_format='markdown', config=ZERO_CONFIG) == my_result
assert '<hi rend="#b">' in etree.tostring(bare_extraction(my_string, output_format='markdown', config=ZERO_CONFIG).body, encoding="unicode")

Expand All @@ -354,7 +354,7 @@ def test_formatting():
Here is a code sample:

`import trafilatura`"""
my_document = html.fromstring('<html><body><article><h3>Title</h3><p>Here is a code sample:</p><code><span>import</span> <span>something</span><br/>something.run("somewhere")</code><p>Sometimes code is wrapped using <code>pre</code> and <code>code</code>:</p><pre><code>import trafilatura\ntrafilatura.extract("")</code></pre><p>Less often code is wrapped using just <code>pre</code>:</p><pre>\n trafilatura.extract("")</pre></article></body></html>')
my_document = html.fromstring('<html><body><article><h3>Title</h3><p>Here is a code sample:</p><code><span>import</span> <span>something</span><br/>something.run("somewhere")</code><p>Sometimes code is wrapped using <code>pre</code> and <code>code</code>:</p><pre><code>import trafilatura\ntrafilatura.extract("")</code></pre><p>Less often code is wrapped using just <code>pre</code>:</p><pre>\ntrafilatura.extract("")</pre></article></body></html>')
my_result = extract(my_document, output_format='txt', include_formatting=True, config=ZERO_CONFIG)
print(my_result)
assert my_result == """### Title
Expand Down Expand Up @@ -419,6 +419,18 @@ def test_formatting():
my_result = extract(my_document, output_format='xml', include_links=True, config=ZERO_CONFIG)
assert '<item>Number <ref target="test.html">2</ref></item>' in my_result

my_document = html.fromstring("""<html><body><article>
<ul>
<li>Number 0</li>
<li>Number <a href="test.html">1</a></li>
<li><a href="test.html">Number 2</a> n2</li>
<li>Number 3</li>
<li><p>Number 4</p> n4</li>
</ul>
Test</article></body></html>
""")
my_result = extract(my_document, output_format='markdown', include_links=True, config=ZERO_CONFIG)
assert my_result == '- Number 0\n- Number [1](test.html)\n- [Number 2](test.html)n2\n- Number 3\n- Number 4 n4\n\nTest'
# XML and Markdown formatting within <p>-tag
my_document = html.fromstring('<html><body><p><b>bold</b>, <i>italics</i>, <tt>tt</tt>, <strike>deleted</strike>, <u>underlined</u>, <a href="test.html">link</a> and additional text to bypass detection.</p></body></html>')
my_result = extract(copy(my_document), fast=True, include_formatting=False, config=ZERO_CONFIG)
Expand Down Expand Up @@ -454,6 +466,27 @@ def test_formatting():
my_result = extract(my_document, output_format='xml', fast=True, include_formatting=True, config=ZERO_CONFIG)
assert '<head rend="h4">1) The <code>in</code> Operator</head>' in my_result and '<p>The easiest way to check if a Python string contains a substring is to use the <code>in</code> operator. The <code>in</code> operator is used to check data structures for membership in Python. It returns a Boolean (either <code>True</code> or <code>False</code>) and can be used as follows:</p>' in my_result

my_document = html.fromstring("""
<html><head><body><article>python code below:
<pre><code>
def test:
print('hello')
print('world')
</code></pre>
</article></body></html>
""")
my_result = extract(my_document, output_format='markdown', include_formatting=True)
assert "python code below:\n```\ndef test:\n print('hello')\n print('world')\n \n```" == my_result

my_result = extract(my_document, output_format='markdown', include_formatting=True)
assert """python code below:
```
def test:
print('hello')
print('world')

```""" == my_result


def test_extract_with_metadata():
'''Test extract_with_metadata method'''
Expand Down Expand Up @@ -1278,7 +1311,7 @@ def test_table_processing():
</article></body></html>
"""
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert result == "| a | b | c |\n| a | b c | |"
assert result == "| a | b | c | \n| a | b c | |"

htmlstring = """
<html><body><article>
Expand All @@ -1296,7 +1329,7 @@ def test_table_processing():
</article></body></html>
"""
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert result == "| a | b | c |\n| a | b c | |\n| a | b c | |"
assert result == "| a | b | c | \n| a | b c | |\n| a | b c | |"

htmlstring = """
<html><body><article>
Expand All @@ -1312,7 +1345,7 @@ def test_table_processing():
"""
result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
include_images=True, include_tables=True)
assert result == "| a | b | c |\n| a ![img](http://aa.bb/c.jpg) a | b c | d |"
assert result == "| a | b | c | \n| a ![img](http://aa.bb/c.jpg) a | b c | d |"

htmlstring = """
<html><body><article>
Expand All @@ -1328,7 +1361,7 @@ def test_table_processing():
"""
result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
include_images=True, include_tables=True)
assert result == "| a | b | c |\n| ![img](http://aa.bb/c.jpg) a | b c | d |"
assert result == "| a | b | c | \n| ![img](http://aa.bb/c.jpg) a | b c | d |"

htmlstring = """
<html><body><article>
Expand All @@ -1344,7 +1377,7 @@ def test_table_processing():
"""
result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
include_images=True, include_tables=True)
assert result == "| a | b | c |\n| ![img](http://aa.bb/c.jpg) a | b c | d |"
assert result == "| a | b | c | \n| ![img](http://aa.bb/c.jpg) a | b c | d |"

htmlstring = """
<html><body><article>
Expand All @@ -1360,7 +1393,7 @@ def test_table_processing():
"""
result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
include_images=True, include_tables=True)
assert result == "| a | b | c |\n| ![img1](http://aa.bb/c.jpg) a ![img2](http://aa.bb/c.jpg) | b c | d |"
assert result == "| a | b | c | \n| ![img1](http://aa.bb/c.jpg) a ![img2](http://aa.bb/c.jpg) | b c | d |"


def test_list_processing():
Expand Down
2 changes: 1 addition & 1 deletion tests/xml_tei_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,7 +486,7 @@ def test_replace_element_text():
elem = Element("item")
elem.text = "Test text"
elem.tag = "item"
assert replace_element_text(elem, True) == "- Test text\n"
assert replace_element_text(elem, True) == "- Test text"

elem = Element("ref")
elem.text = "Link"
Expand Down
5 changes: 3 additions & 2 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@ def determine_returnstring(document: Document, options: Extractor) -> str:
header = ""
returnstring = f"{header}{xmltotxt(document.body, options.formatting)}"
if document.commentsbody is not None:
returnstring = f"{returnstring}\n{xmltotxt(document.commentsbody, options.formatting)}".strip()
returnstring = \
f"{returnstring}\n{xmltotxt(document.commentsbody, options.formatting)}".strip()
# normalize Unicode format (defaults to NFC)
return normalize_unicode(returnstring)

Expand Down Expand Up @@ -487,7 +488,7 @@ def extract_with_metadata(
include_images: Take images into account (experimental).
include_formatting: Keep structural elements related to formatting
(only valuable if output_format is set to XML).
include_links: Keep links along with their targets (experimental).
= include_links: Keep links along with their targets (experimental).
Copy link
Owner

@adbar adbar Feb 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can be removed, right?

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@unsleepy22 Could you please remove it if necessary?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, maybe mis-typed this.

deduplicate: Remove duplicate segments and documents.
date_extraction_params: Provide extraction parameters to htmldate as dict().
url_blacklist: Provide a blacklist of URLs as set() to filter out documents.
Expand Down
18 changes: 16 additions & 2 deletions trafilatura/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

from functools import lru_cache
from itertools import islice
from typing import Any, List, Literal, Optional, Tuple, Union
from typing import Any, cast, List, Literal, Optional, Tuple, Union
from unicodedata import normalize

# response compression
Expand Down Expand Up @@ -464,4 +464,18 @@ def copy_attributes(dest_elem: _Element, src_elem: _Element) -> None:

def is_in_table_cell(elem: _Element) -> bool:
    '''Check whether an element is a table cell or is nested inside one.

    Args:
        elem: the element to examine.

    Returns:
        True if ``elem`` has a parent and either ``elem`` itself or one of
        its ancestors carries the tag ``cell``; False otherwise.
    '''
    # a detached element (no parent) cannot be part of a table
    if elem.getparent() is None:
        return False
    # Walk up from the element itself instead of using the XPath
    # '//ancestor::cell': the leading '//' makes that expression absolute,
    # so it matches as soon as *any* cell in the document has a descendant,
    # regardless of where ``elem`` is located.
    current: Optional[_Element] = elem
    while current is not None:
        if current.tag == 'cell':
            return True
        current = current.getparent()
    return False


def is_last_element_in_cell(elem: _Element) -> bool:
    '''Check whether an element is the last element in a table cell.

    Args:
        elem: the element to examine.

    Returns:
        False if ``is_in_table_cell(elem)`` is false. For a ``cell`` element,
        True only when it has no child elements. For any other element,
        True when it is the last child of its parent.
    '''
    if not is_in_table_cell(elem):  # shortcut
        return False

    if elem.tag == "cell":
        # a cell can never be its own last child, so the only way it
        # qualifies is by having no child elements at all
        return len(elem) == 0

    parent = elem.getparent()
    if parent is None:
        # detached element: there is no sibling list to be last in
        return False
    # the parent has at least one child (elem itself), so indexing is safe
    return parent[-1] == elem
67 changes: 53 additions & 14 deletions trafilatura/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
fromstring, tostring, DTD)

from .settings import Document, Extractor
from .utils import is_in_table_cell, sanitize, sanitize_tree, text_chars_test
from .utils import is_in_table_cell, is_last_element_in_cell, sanitize, sanitize_tree, text_chars_test


LOGGER = logging.getLogger(__name__)
Expand All @@ -35,7 +35,7 @@
CONTROL_PARSER = XMLParser(remove_blank_text=True)

NEWLINE_ELEMS = {'graphic', 'head', 'lb', 'list', 'p', 'quote', 'row', 'table'}
SPECIAL_FORMATTING = {'code', 'del', 'head', 'hi', 'ref'}
SPECIAL_FORMATTING = {'code', 'del', 'head', 'hi', 'ref', 'item', 'cell'}
WITH_ATTRIBUTES = {'cell', 'row', 'del', 'graphic', 'head', 'hi', 'item', 'list', 'ref'}
NESTING_WHITELIST = {"cell", "figure", "item", "note", "quote"}

Expand Down Expand Up @@ -249,13 +249,45 @@ def validate_tei(xmldoc: _Element) -> bool:

return result

def is_element_in_item(element: _Element) -> bool:
    """Tell whether the element is an ``item`` or is nested inside one.

    The tag is tested first, so the ancestor query only runs for elements
    that are not list items themselves.
    """
    if element.tag == 'item':
        return True
    enclosing_items = element.xpath('ancestor::item')
    return bool(enclosing_items)


def is_first_element_in_item(element: _Element) -> bool:
    """Decide whether the element opens the content of a list item.

    Returns True when the element is an ``item`` that has text of its own,
    or when at least one ``item`` ancestor of the element has no text.
    Returns False in every other case.
    """
    # NOTE(review): the ancestor test does not look at the element's position,
    # so later siblings inside a text-less item appear to match as well --
    # confirm this is intended.
    if element.tag == 'item' and element.text:
        return True
    return any(not ancestor.text for ancestor in element.xpath('ancestor::item'))


def is_last_element_in_item(element: _Element) -> bool:
    """Decide whether the element closes the content of a list item.

    Returns False for anything outside of a list item. An ``item`` itself
    qualifies only when it has no child elements (pure text). An element
    inside an item qualifies when it has no following sibling, or when the
    following sibling is an ``item``.
    """
    if is_element_in_item(element):
        if element.tag == 'item':
            # list item holding nothing but text
            return len(element.getchildren()) == 0
        # element nested within a list item: look at what follows it
        following = element.getnext()
        return following is None or following.tag == 'item'
    return False


def replace_element_text(element: _Element, include_formatting: bool) -> str:
"""Determine element text based on just the text of the element. One must deal with the tail separately."""
elem_text = element.text or ""
"Determine element text based on just the text of the element. One must deal with the tail separately."
# handle formatting: convert to markdown
if include_formatting and element.text:
if element.tag == "head":
if element.tag in ('article', 'list', 'table'):
elem_text = elem_text.strip()
elif element.tag == "head":
try:
number = int(element.get("rend")[1]) # type: ignore[index]
except (TypeError, ValueError):
Expand Down Expand Up @@ -289,14 +321,16 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:
else:
LOGGER.warning("empty link: %s %s", elem_text, element.attrib)
# cells
if element.tag == "cell":
if element.tag == 'cell':
elem_text = elem_text.strip()

if elem_text:
if elem_text and not is_last_element_in_cell(element):
elem_text = f"{elem_text} "
# lists
elif element.tag == "item" and elem_text:
elem_text = f"- {elem_text}\n"

# within lists
if is_first_element_in_item(element) and not is_in_table_cell(element):
elem_text = f"- {elem_text}"

return elem_text


Expand Down Expand Up @@ -344,25 +378,30 @@ def process_element(element: _Element, returnlist: List[str], include_formatting
returnlist.append(f'\n|{"---|" * max_span}\n')
else:
returnlist.append("\n")
elif element.tag != "cell":
elif element.tag != "cell" and element.tag != 'item':
# cells still need to append vertical bars
# but nothing more to do with other textless elements
return

# Process text

# Common elements (Now processes end-tag logic correctly)
if element.tag in NEWLINE_ELEMS and not element.xpath("ancestor::cell"):
if element.tag in NEWLINE_ELEMS and not element.xpath("ancestor::cell") and not is_element_in_item(element):
# spacing hack
returnlist.append("\n\u2424\n" if include_formatting and element.tag != 'row' else "\n")
elif element.tag == "cell":
returnlist.append(" | ")
elif element.tag not in SPECIAL_FORMATTING:
elif element.tag not in SPECIAL_FORMATTING and not is_last_element_in_cell(element): # and not is_in_table_cell(element)
returnlist.append(" ")

# this is text that comes after the closing tag, so it should be after any NEWLINE_ELEMS
# unless it's within a list item or a table
if element.tail and not is_in_table_cell(element):
returnlist.append(element.tail)
returnlist.append(element.tail.strip() if is_element_in_item(element) or element.tag=='list' else element.tail)

# deal with list items alone
if is_last_element_in_item(element) and not is_in_table_cell(element):
returnlist.append('\n')


def xmltotxt(xmloutput: Optional[_Element], include_formatting: bool) -> str:
Expand All @@ -374,7 +413,7 @@ def xmltotxt(xmloutput: Optional[_Element], include_formatting: bool) -> str:

process_element(xmloutput, returnlist, include_formatting)

return unescape(sanitize("".join(returnlist)) or "")
return unescape(sanitize("".join(returnlist), True) or "")


def xmltocsv(document: Document, include_formatting: bool, *, delim: str = "\t", null: str = "null") -> str:
Expand Down
Loading