maintenance: simplify code (#408)
* maintenance: simplify code

* simplify further

* simplify CLI code
adbar authored Aug 25, 2023
1 parent 46fb398 commit 088283c
Showing 8 changed files with 152 additions and 212 deletions.
2 changes: 1 addition & 1 deletion tests/cli_tests.py
@@ -448,7 +448,7 @@ def test_crawling():

def test_probing():
"Test webpage probing functions."
url = 'https://httpbun.org/html'
url = 'https://example.org/'
testargs = ['', '--probe', url, '--target-language', 'de']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
79 changes: 37 additions & 42 deletions trafilatura/cli_utils.py
@@ -42,45 +42,45 @@
STRIP_DIR = re.compile(r'[^/]+$')
STRIP_EXTENSION = re.compile(r'\.[a-z]{2,5}$')

INPUT_URLS_ARGS = ['URL', 'crawl', 'explore', 'probe', 'feed', 'sitemap']

EXTENSION_MAPPING = {
'csv': '.csv',
'json': '.json',
'xml': '.xml',
'xmltei': '.xml',
}


def load_input_urls(args):
'''Read list of URLs to process or derive one from command-line arguments'''
input_urls = []

if args.input_file:
input_urls = []
try:
# optional: errors='strict', buffering=1
with open(args.input_file, mode='r', encoding='utf-8') as inputfile:
for line in inputfile:
input_urls.append(line.strip())
input_urls.extend(line.strip() for line in inputfile)
except UnicodeDecodeError:
sys.exit('ERROR: system, file type or buffer encoding')
elif args.URL:
input_urls = [args.URL]
elif args.crawl:
input_urls = [args.crawl]
elif args.explore:
input_urls = [args.explore]
elif args.probe:
input_urls = [args.probe]
elif args.feed:
input_urls = [args.feed]
elif args.sitemap:
input_urls = [args.sitemap]
else:
for arg in INPUT_URLS_ARGS:
if getattr(args, arg):
input_urls = [getattr(args, arg)]
break

if not input_urls:
LOGGER.warning('No input provided')
input_urls = []

# uniq URLs while preserving order (important)
return uniquify_list(input_urls)


def load_blacklist(filename):
'''Read list of unwanted URLs'''
blacklist = set()
with open(filename, mode='r', encoding='utf-8') as inputfh:
for line in inputfh:
url = line.strip()
# if validate_url(url)[0] is True:
blacklist.add(URL_BLACKLIST_REGEX.sub('', url))
with open(filename, 'r', encoding='utf-8') as inputfh:
# if validate_url(url)[0] is True:
blacklist = {URL_BLACKLIST_REGEX.sub('', line.strip()) for line in inputfh}
return blacklist
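
Note on the refactor above: the elif chain becomes a single loop over INPUT_URLS_ARGS using getattr(), and load_blacklist builds its set in one comprehension. A minimal standalone sketch of the same patterns (the Namespace values and the trailing-slash regex are invented stand-ins, not trafilatura internals):

    import re
    from argparse import Namespace

    INPUT_URLS_ARGS = ['URL', 'crawl', 'explore', 'probe', 'feed', 'sitemap']
    TRAILING_SLASH = re.compile(r'/+$')  # illustrative stand-in for URL_BLACKLIST_REGEX

    def first_input_url(args):
        # pick the first URL-bearing argument, as in the loop above
        for arg in INPUT_URLS_ARGS:
            if getattr(args, arg, None):
                return [getattr(args, arg)]
        return []

    args = Namespace(URL=None, crawl=None, explore=None,
                     probe='https://example.org/page', feed=None, sitemap=None)
    print(first_input_url(args))  # ['https://example.org/page']

    lines = ['https://example.org/a/\n', 'https://example.org/b\n']
    print({TRAILING_SLASH.sub('', line.strip()) for line in lines})
    # both URLs, stripped of the trailing slash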


@@ -139,28 +139,23 @@ def get_writable_path(destdir, extension):

def determine_output_path(args, orig_filename, content, counter=None, new_filename=None):
'''Pick a directory based on selected options and a file name based on output type'''
# determine extension
extension = '.txt'
if args.output_format in ('xml', 'xmltei'):
extension = '.xml'
elif args.output_format == 'csv':
extension = '.csv'
elif args.output_format == 'json':
extension = '.json'
# determine directory
if args.keep_dirs is True:
# determine extension, TXT by default
extension = EXTENSION_MAPPING.get(args.output_format, '.txt')

if args.keep_dirs:
# strip directory
orig_directory = STRIP_DIR.sub('', orig_filename)
destination_directory = path.join(args.output_dir, orig_directory)
original_dir = STRIP_DIR.sub('', orig_filename)
destination_dir = path.join(args.output_dir, original_dir)
# strip extension
filename = STRIP_EXTENSION.sub('', orig_filename)
output_path = path.join(args.output_dir, filename + extension)
else:
destination_directory = determine_counter_dir(args.output_dir, counter)
destination_dir = determine_counter_dir(args.output_dir, counter)
# use cryptographic hash on file contents to define name
filename = new_filename or generate_hash_filename(content)
output_path = path.join(destination_directory, filename + extension)
return output_path, destination_directory

output_path = path.join(destination_dir, filename + extension)
return output_path, destination_dir
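
The extension choice now relies on dict.get() with '.txt' as the fallback for any unmapped output format; a quick illustrative check:

    EXTENSION_MAPPING = {'csv': '.csv', 'json': '.json', 'xml': '.xml', 'xmltei': '.xml'}

    for output_format in ('xmltei', 'json', 'txt'):
        # unmapped formats (here 'txt') fall back to the '.txt' default
        print(output_format, EXTENSION_MAPPING.get(output_format, '.txt'))
    # xmltei .xml / json .json / txt .txt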



def archive_html(htmlstring, args, counter=None):
@@ -182,9 +177,9 @@ def write_result(result, args, orig_filename=None, counter=None, new_filename=No
if args.output_dir is None:
sys.stdout.write(result + '\n')
else:
destination_path, destination_directory = determine_output_path(args, orig_filename, result, counter, new_filename)
destination_path, destination_dir = determine_output_path(args, orig_filename, result, counter, new_filename)
# check the directory status
if check_outputdir_status(destination_directory) is True:
if check_outputdir_status(destination_dir) is True:
with open(destination_path, mode='w', encoding='utf-8') as outputfile:
outputfile.write(result)

@@ -268,8 +263,8 @@ def cli_discovery(args):

def build_exploration_dict(url_store, input_urls, args):
"Find domains for which nothing has been found and add info to the crawl dict."
input_domains = set(extract_domain(u) for u in input_urls)
known_domains = set(extract_domain(u) for u in url_store.get_known_domains())
input_domains = {extract_domain(u) for u in input_urls}
known_domains = {extract_domain(u) for u in url_store.get_known_domains()}
still_to_crawl = input_domains - known_domains
new_input_urls = [u for u in input_urls if extract_domain(u) in still_to_crawl]
control_dict = add_to_compressed_dict(
4 changes: 2 additions & 2 deletions trafilatura/core.py
@@ -827,8 +827,8 @@ def determine_returnstring(document, output_format, include_formatting, tei_vali
else:
returnstring = xmltotxt(document.body, include_formatting)
if document.commentsbody is not None:
returnstring += '\n' + xmltotxt(document.commentsbody, include_formatting)
returnstring = returnstring.strip()
comments_text = xmltotxt(document.commentsbody, include_formatting)
returnstring = f"{returnstring}\n{comments_text}".strip()
# normalize Unicode format (defaults to NFC)
return normalize_unicode(returnstring)
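
The f-string keeps the previous behavior (comments appended on a new line, result stripped); a toy illustration with invented strings:

    returnstring = "Main text.\n"
    comments_text = "A comment.\n"
    # same result as the old "+= '\n' + ..." followed by .strip()
    print(repr(f"{returnstring}\n{comments_text}".strip()))
    # 'Main text.\n\nA comment.'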

17 changes: 5 additions & 12 deletions trafilatura/downloads.py
@@ -206,28 +206,21 @@ def is_live_page(url):

def add_to_compressed_dict(inputlist, blacklist=None, url_filter=None, url_store=None, compression=False, verbose=False):
'''Filter, convert input URLs and add them to domain-aware processing dictionary'''
# init
if url_store is None:
url_store = UrlStore(
compressed=compression,
strict=False,
verbose=verbose
)
# deduplicate while keeping order

inputlist = uniquify_list(inputlist)
# filter

if blacklist:
inputlist = [u for u in inputlist if URL_BLACKLIST_REGEX.sub('', u) not in blacklist]

if url_filter:
filtered_list = []
while inputlist:
u = inputlist.pop()
for f in url_filter:
if f in u:
filtered_list.append(u)
break
inputlist = filtered_list
# validate and store
inputlist = [u for u in inputlist if any(f in u for f in url_filter)]

url_store.add_urls(inputlist)
return url_store
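
The rewritten url_filter step keeps any URL containing at least one filter substring, and unlike the earlier pop()-based loop it preserves the input order; a standalone sketch with made-up URLs:

    inputlist = [
        'https://example.org/blog/post-1',
        'https://example.org/shop/item-2',
        'https://example.org/blog/post-3',
    ]
    url_filter = ['/blog/']
    print([u for u in inputlist if any(f in u for f in url_filter)])
    # ['https://example.org/blog/post-1', 'https://example.org/blog/post-3']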

36 changes: 12 additions & 24 deletions trafilatura/filters.py
@@ -61,21 +61,15 @@ def check_html_lang(tree, target_language, strict=False):
'''Check HTML meta-elements for language information and split
the result in case there are several languages'''
# https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Language
target_elements = tree.findall('.//meta[@http-equiv="content-language"][@content]')
if target_elements:
for elem in target_elements:
if target_language in RE_HTML_LANG.split(elem.get('content').lower()):
return True
LOGGER.debug('HTML content-language failed')
return False
# locale
target_elements = tree.findall('.//meta[@property="og:locale"][@content]')
if target_elements:
for elem in target_elements:
if target_language in RE_HTML_LANG.split(elem.get('content').lower()):
return True
LOGGER.debug('HTML og:locale failed')
return False
target_attrs = ['http-equiv="content-language"', 'property="og:locale"']
for attr in target_attrs:
target_elements = tree.findall(f'.//meta[@{attr}][@content]')
if target_elements:
for elem in target_elements:
if target_language in RE_HTML_LANG.split(elem.get('content', '').lower()):
return True
LOGGER.debug('%s failed', attr)
return False
# HTML lang attribute: sometimes a wrong indication
if strict is True:
target_elements = tree.xpath('//html[@lang]')
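
The duplicated content-language and og:locale checks collapse into one loop over the attribute selectors; roughly, using lxml and a simplified splitting regex standing in for RE_HTML_LANG:

    import re
    from lxml import html

    RE_LANG = re.compile(r'[,;\s]+')  # simplified stand-in for RE_HTML_LANG

    def meta_lang_matches(tree, target_language):
        for attr in ('http-equiv="content-language"', 'property="og:locale"'):
            elements = tree.findall(f'.//meta[@{attr}][@content]')
            if elements:
                return any(target_language in RE_LANG.split(e.get('content', '').lower())
                           for e in elements)
        return True  # no meta hints found; the real function then checks <html lang>

    doc = '<html><head><meta http-equiv="content-language" content="de, en"></head><body></body></html>'
    print(meta_lang_matches(html.fromstring(doc), 'de'))  # True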
@@ -122,19 +116,13 @@ def language_filter(temp_text, temp_comments, target_language, docmeta):

def textfilter(element):
'''Filter out unwanted text'''
# print('#', element.text)
if element.text is None and element.tail is not None:
testtext = element.tail
else:
testtext = element.text
if text_chars_test(testtext) is False:
return True
testtext = element.tail if element.text is None else element.text
# to check: line len → continue if len(line) <= 5
return any(RE_FILTER.match(line) for line in testtext.splitlines())
return not text_chars_test(testtext) or any(map(RE_FILTER.match, testtext.splitlines()))


def text_chars_test(string):
'''Determine if a string is only composed of spaces and/or control characters'''
# or not re.search(r'\w', string)
# return string is not None and len(string) != 0 and not string.isspace()
return string not in (None, '') and not string.isspace()
return bool(string) and not string.isspace()
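
For reference, the condensed textfilter and text_chars_test behave like this small standalone version (the filter regex here is a placeholder, not trafilatura's RE_FILTER):

    import re
    from lxml import etree

    RE_FILTER = re.compile(r'\W*(Share|Tweet|Comments?)\W*$', re.IGNORECASE)  # placeholder

    def text_chars_test(string):
        # True if the string holds more than spaces/control characters
        return bool(string) and not string.isspace()

    def textfilter(element):
        # True means: discard this element's text
        testtext = element.tail if element.text is None else element.text
        return not text_chars_test(testtext) or any(map(RE_FILTER.match, testtext.splitlines()))

    elem = etree.Element('p')
    elem.text = 'Share'
    print(textfilter(elem))   # True, boilerplate-like line
    elem.text = 'A real sentence.'
    print(textfilter(elem))   # False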
82 changes: 35 additions & 47 deletions trafilatura/htmlprocessing.py
@@ -43,6 +43,21 @@
)


REND_TAG_MAPPING = {
'em': '#i',
'i': '#i',
'b': '#b',
'strong': '#b',
'u': '#u',
'kbd': '#t',
'samp': '#t',
'tt': '#t',
'var': '#t',
'sub': '#sub',
'sup': '#sup'
}


def tree_cleaning(tree, options):
'''Prune the tree by discarding unwanted elements'''
# determine cleaning strategy, use lists to keep it deterministic
@@ -117,19 +132,16 @@ def collect_link_info(links_xpath, favor_precision=False):
# init
shortelems, mylist = 0, []
# longer strings impact recall in favor of precision
if favor_precision is False:
threshold = 10
else:
threshold = 50
threshold = 10 if not favor_precision else 50
# examine the elements
for subelem in links_xpath:
subelemtext = trim(subelem.text_content())
if not subelemtext:
continue
mylist.append(subelemtext)
lengths = [len(text) for text in mylist]
shortelems = len([l for l in lengths if l < threshold])
return sum(lengths), len(mylist), shortelems, mylist
if subelemtext:
mylist.append(subelemtext)
if len(subelemtext) < threshold:
shortelems += 1
lengths = sum(len(text) for text in mylist)
return lengths, len(mylist), shortelems, mylist
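
collect_link_info now counts short anchors in the same pass instead of building a separate list of lengths; a simplified sketch with plain strings in place of lxml elements:

    def collect_link_info(link_texts, favor_precision=False):
        threshold = 10 if not favor_precision else 50
        shortelems, mylist = 0, []
        for text in link_texts:
            text = text.strip()
            if text:
                mylist.append(text)
                if len(text) < threshold:
                    shortelems += 1
        return sum(len(t) for t in mylist), len(mylist), shortelems, mylist

    print(collect_link_info(['Home', 'Contact', 'A much longer anchor text here']))
    # (41, 3, 2, ['Home', 'Contact', 'A much longer anchor text here'])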


def link_density_test(element, text, favor_precision=False):
@@ -222,10 +234,9 @@ def convert_tags(tree, options, url=None):
'''Simplify markup and convert relevant HTML tags to an XML standard'''
# delete links for faster processing
if options.links is False:
xpath_expr = './/div//a|.//ul//a' # .//p//a ?
if options.tables is True:
xpath_expr = './/div//a|.//table//a|.//ul//a' # .//p//a ?
else:
xpath_expr = './/div//a|.//ul//a' # .//p//a ?
xpath_expr += '|.//table//a'
# necessary for further detection
for elem in tree.xpath(xpath_expr):
elem.tag = 'ref'
@@ -246,32 +257,12 @@ def convert_tags(tree, options, url=None):
elem.set('target', target)
# include_formatting
if options.formatting is False:
strip_tags(tree, 'em', 'i', 'b', 'strong', 'u', 'kbd', 'samp', 'tt', 'var', 'sub', 'sup')
strip_tags(tree, *REND_TAG_MAPPING)
else:
for elem in tree.iter('em', 'i', 'b', 'strong', 'u', 'kbd', 'samp', 'tt', 'var', 'sub', 'sup'):
# italics
if elem.tag in ('em', 'i'):
elem.tag = 'hi'
elem.set('rend', '#i')
# bold font
elif elem.tag in ('b', 'strong'):
elem.tag = 'hi'
elem.set('rend', '#b')
# u (very rare)
elif elem.tag == 'u':
elem.tag = 'hi'
elem.set('rend', '#u')
# tt (very rare)
elif elem.tag in ('kbd', 'samp', 'tt', 'var'):
elem.tag = 'hi'
elem.set('rend', '#t')
# sub and sup (very rare)
elif elem.tag == 'sub':
elem.tag = 'hi'
elem.set('rend', '#sub')
elif elem.tag == 'sup':
elem.tag = 'hi'
elem.set('rend', '#sup')
for elem in tree.iter(list(REND_TAG_MAPPING)):
attribute = REND_TAG_MAPPING[elem.tag]
elem.tag = 'hi'
elem.set('rend', attribute)
# iterate over all concerned elements
for elem in tree.iter('blockquote', 'br', 'del', 'details', 'dl', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'ol', 'pre', 'q', 's', 'strike', 'ul'):
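
REND_TAG_MAPPING now drives both branches above: strip_tags(tree, *REND_TAG_MAPPING) drops the inline tags when formatting is off, and the loop rewrites them to <hi rend="..."> otherwise. A short lxml sketch with an invented snippet:

    from lxml import html
    from lxml.etree import tostring

    REND_TAG_MAPPING = {'em': '#i', 'i': '#i', 'b': '#b', 'strong': '#b', 'u': '#u',
                        'kbd': '#t', 'samp': '#t', 'tt': '#t', 'var': '#t',
                        'sub': '#sub', 'sup': '#sup'}

    tree = html.fragment_fromstring('<p>some <b>bold</b> and <em>italic</em> text</p>')
    for elem in tree.iter(*REND_TAG_MAPPING):
        # look up the rend value before renaming the tag
        elem.set('rend', REND_TAG_MAPPING[elem.tag])
        elem.tag = 'hi'
    print(tostring(tree, encoding='unicode'))
    # <p>some <hi rend="#b">bold</hi> and <hi rend="#i">italic</hi> text</p>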
# ul/ol → list / li → item
Expand All @@ -282,7 +273,7 @@ def convert_tags(tree, options, url=None):
for subelem in elem.iter('dd', 'dt', 'li'):
# keep track of dd/dt items
if subelem.tag in ('dd', 'dt'):
subelem.set('rend', subelem.tag + '-' + str(i))
subelem.set('rend', f"{subelem.tag}-{i}")
# increment counter after <dd> in description list
if subelem.tag == 'dd':
i += 1
@@ -330,20 +321,18 @@ def handle_textnode(element, options, comments_fix=True, preserve_spaces=False):
if element.text is None:
# try the tail
# LOGGER.debug('using tail for element %s', element.tag)
element.text = element.tail
element.tail = ''
element.text, element.tail = element.tail, ''
# handle differently for br/lb
if comments_fix is True and element.tag == 'lb':
if comments_fix and element.tag == 'lb':
element.tag = 'p'
# trim
if preserve_spaces is False:
element.text = trim(element.text)
if element.tail:
element.tail = trim(element.tail)
# filter content
if not element.text: # or not re.search(r'\w', element.text): # text_content()?
return None
if textfilter(element) is True:
# or not re.search(r'\w', element.text): # text_content()?
if not element.text or textfilter(element) is True:
return None
if options.dedup and duplicate_test(element, options.config) is True:
return None
@@ -360,8 +349,7 @@ def process_node(element, options):
element.text, element.tail = trim(element.text), trim(element.tail)
# adapt content string
if element.tag != 'lb' and not element.text and element.tail:
element.text = element.tail
element.tail = None
element.text, element.tail = element.tail, None
# content checks
if element.text or element.tail:
if textfilter(element) is True:
4 changes: 1 addition & 3 deletions trafilatura/utils.py
@@ -243,9 +243,7 @@ def txttocsv(text, comments, docmeta):
@lru_cache(maxsize=2**14) # sys.maxunicode = 1114111
def return_printables_and_spaces(char):
'Return a character if it belongs to certain classes'
if char.isprintable() or char.isspace():
return char
return ''
return char if char.isprintable() or char.isspace() else ''


def remove_control_characters(string):