diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 7e9beae7..b793b074 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -499,6 +499,13 @@ def test_images(): assert extract('

text

', include_images=True, fast=True) == '' assert extract('

text

', include_images=True, fast=True) == '![a title text](test.jpg)' assert extract('

text

', include_images=True, fast=True) == '![a title text](test.jpg)' + assert extract('

text

', include_images=True, fast=True) == '![a title text](https://a.b/test.jpg)' + + url = 'http://a.b/c/d.html' + assert extract('

text

', url=url, include_images=True, fast=True) == '![a title text](http://a.b/test.jpg)' + assert extract('

text

', url=url, include_images=True, fast=True) == '![a title text](http://a.b/a.b/test.jpg)' + assert extract('

text

', url=url, include_images=True, fast=True) == '![a title text](http://a.b/c/a.b/test.jpg)' + assert extract('

text

', url=url, include_images=True, fast=True) == '![a title text](http://a.b/a.b/test.jpg)' assert handle_image(html.fromstring('text')) is None @@ -1187,6 +1194,38 @@ def test_table_processing(): result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True) assert result == "" + htmlstring = """ +
+ + + +
abc
a +

b

+

c

+
+
+ """ + result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True) + assert result == "| a | b | c |\n| a | b c | |" + + htmlstring = """ +
+ + + + +
abc
a +

b

+

c

+
a +

b

+

c

+
+
+ """ + result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True) + assert result == "| a | b | c |\n| a | b c | |\n| a | b c | |" + def test_list_processing(): options = DEFAULT_OPTIONS diff --git a/trafilatura/main_extractor.py b/trafilatura/main_extractor.py index 2a950bec..4af74329 100644 --- a/trafilatura/main_extractor.py +++ b/trafilatura/main_extractor.py @@ -8,6 +8,7 @@ from copy import deepcopy from typing import Any, Optional, Tuple, Set, Union +from urllib.parse import urljoin from lxml.etree import _Element, Element, SubElement, strip_elements, strip_tags, tostring from lxml.html import HtmlElement @@ -333,7 +334,7 @@ def handle_paragraphs(element: _Element, potential_tags: Set[str], options: Extr newsub.text, newsub.tail = processed_child.text, processed_child.tail if processed_child.tag == 'graphic': - image_elem = handle_image(processed_child) + image_elem = handle_image(processed_child, options) if image_elem is not None: newsub = image_elem processed_element.append(newsub) @@ -367,10 +368,16 @@ def handle_table(table_elem: _Element, potential_tags: Set[str], options: Extrac # strip these structural elements strip_tags(table_elem, "thead", "tbody", "tfoot") - # calculate maximum number of columns per row, includin colspan + # calculate maximum number of columns per row, including colspan max_cols = 0 + diff_colspans = set() for tr in table_elem.iter('tr'): - max_cols = max(max_cols, sum(int(td.get("colspan", 1)) for td in tr.iter(TABLE_ELEMS))) + total_colspans = 0 + for td in tr.iter(TABLE_ELEMS): + colspan = int(td.get("colspan", 1)) + diff_colspans.add(colspan) + total_colspans += colspan + max_cols = max(max_cols, total_colspans) # explore sub-elements seen_header_row = False @@ -431,8 +438,9 @@ def handle_table(table_elem: _Element, potential_tags: Set[str], options: Extrac # cleanup subelement.tag = "done" - # clean up row attributes - newrow.attrib.pop("span", None) + # clean up row attributes only when all cells in table share the same colspan + if len(diff_colspans) == 1: + newrow.attrib.pop("span", None) # end of processing if len(newrow) > 0: @@ -442,7 +450,7 @@ def handle_table(table_elem: _Element, potential_tags: Set[str], options: Extrac return None -def handle_image(element: Optional[_Element]) -> Optional[_Element]: +def handle_image(element: Optional[_Element], options: Optional[Extractor] = None) -> Optional[_Element]: "Process image elements and their relevant attributes." if element is None: return None @@ -472,9 +480,13 @@ def handle_image(element: Optional[_Element]) -> Optional[_Element]: return None # post-processing: URLs - src_attr = processed_element.get("src", "") - if not src_attr.startswith("http"): - processed_element.set("src", re.sub(r"^//", "http://", src_attr)) + link = processed_element.get("src", "") + if not link.startswith("http"): + if options is not None and options.url is not None: + link = urljoin(options.url, link) + else: + link = re.sub(r"^//", "http://", link) + processed_element.set("src", link) return processed_element @@ -502,7 +514,7 @@ def handle_textelem(element: _Element, potential_tags: Set[str], options: Extrac elif element.tag == 'table' and 'table' in potential_tags: new_element = handle_table(element, potential_tags, options) elif element.tag == 'graphic' and 'graphic' in potential_tags: - new_element = handle_image(element) + new_element = handle_image(element, options) else: # other elements (div, ??, ??) new_element = handle_other_elements(element, potential_tags, options) diff --git a/trafilatura/xml.py b/trafilatura/xml.py index 953a5f9f..f5e6c57e 100644 --- a/trafilatura/xml.py +++ b/trafilatura/xml.py @@ -285,12 +285,15 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str: else: LOGGER.warning("empty link: %s %s", elem_text, element.attrib) # cells - if element.tag == "cell" and elem_text and len(element) > 0: - if element[0].tag == 'p': - elem_text = f"{elem_text} " if element.getprevious() is not None else f"| {elem_text} " - elif element.tag == 'cell' and elem_text: - # add | before first cell - elem_text = f"{elem_text}" if element.getprevious() is not None else f"| {elem_text}" + if element.tag == "cell": + elem_text = elem_text.strip() + + if elem_text and len(element) > 0: + if element[0].tag == 'p': + elem_text = f"{elem_text} " if element.getprevious() is not None else f"| {elem_text} " + elif elem_text: + # add | before first cell + elem_text = f"{elem_text}" if element.getprevious() is not None else f"| {elem_text}" # lists elif element.tag == "item" and elem_text: elem_text = f"- {elem_text}\n" @@ -348,7 +351,7 @@ def process_element(element: _Element, returnlist: List[str], include_formatting # this is text that comes after the closing tag, so it should be after any NEWLINE_ELEMS if element.tail: - returnlist.append(element.tail) + returnlist.append(element.tail.strip() if element.tag == 'cell' else element.tail) def xmltotxt(xmloutput: Optional[_Element], include_formatting: bool) -> str: