Skip to content

Commit

Permalink
extraction: use backup if manual pruning was too harsh
Browse files (browse the repository at this point in the history)
  • Loading branch information
adbar committed May 10, 2022
1 parent 3291ed2 commit 086c610
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 4 deletions.
7 changes: 5 additions & 2 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
@@ -467,7 +467,10 @@ def delete_by_link_density(subtree, tagname, backtracking=False):
# else: # and not re.search(r'[?!.]', text):
# print(elem.tag, templist)
for elem in uniquify_list(deletions):
- elem.getparent().remove(elem)
+ try:
+ elem.getparent().remove(elem)
+ except AttributeError:
+ pass
return subtree


@@ -493,7 +496,7 @@ def extract_content(tree, favor_precision=False, favor_recall=False, include_tab
except IndexError:
continue
# prune the rest
- subtree = prune_unwanted_nodes(subtree, OVERALL_DISCARD_XPATH)
+ subtree = prune_unwanted_nodes(subtree, OVERALL_DISCARD_XPATH, with_backup=True)
# prune images
if include_images is False:
subtree = prune_unwanted_nodes(subtree, DISCARD_IMAGE_ELEMENTS)
Expand Down
16 changes: 14 additions & 2 deletions trafilatura/htmlprocessing.py
Original file line number Diff line number Diff line change
@@ -9,6 +9,8 @@
import logging
import re

+ from copy import deepcopy
+
from lxml.etree import strip_tags
from lxml.html.clean import Cleaner

@@ -77,8 +79,11 @@ def prune_html(tree):
return tree


- def prune_unwanted_nodes(tree, nodelist):
+ def prune_unwanted_nodes(tree, nodelist, with_backup=False):
'''Prune the HTML tree by removing unwanted sections.'''
+ if with_backup is True:
+ old_len = len(tree.text_content()) # ' '.join(tree.itertext())
+ backup = deepcopy(tree)
for expr in nodelist:
for subtree in tree.xpath(expr):
# preserve tail text from deletion
@@ -94,7 +99,14 @@ def prune_unwanted_nodes(tree, nodelist):
previous.tail = subtree.tail
# remove the node
subtree.getparent().remove(subtree)
- return tree
+ if with_backup is False:
+ return tree
+ else:
+ new_len = len(tree.text_content())
+ # todo: adjust for recall and precision settings
+ if new_len > old_len/7:
+ return tree
+ return backup


def collect_link_info(links_xpath):
Expand Down

0 comments on commit 086c610

Please sign in to comment.