maintenance: simplify code (#408)
* maintenance: simplify code

* simplify further

* simplify CLI code
adbar authored Aug 25, 2023
1 parent 46fb398 commit 088283c
Showing 8 changed files with 152 additions and 212 deletions.
2 changes: 1 addition & 1 deletion tests/cli_tests.py
@@ -448,7 +448,7 @@ def test_crawling():

def test_probing():
"Test webpage probing functions."
url = 'https://httpbun.org/html'
url = 'https://example.org/'
testargs = ['', '--probe', url, '--target-language', 'de']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
79 changes: 37 additions & 42 deletions trafilatura/cli_utils.py
@@ -42,45 +42,45 @@
STRIP_DIR = re.compile(r'[^/]+$')
STRIP_EXTENSION = re.compile(r'\.[a-z]{2,5}$')

INPUT_URLS_ARGS = ['URL', 'crawl', 'explore', 'probe', 'feed', 'sitemap']

EXTENSION_MAPPING = {
'csv': '.csv',
'json': '.json',
'xml': '.xml',
'xmltei': '.xml',
}


def load_input_urls(args):
'''Read list of URLs to process or derive one from command-line arguments'''
input_urls = []

if args.input_file:
input_urls = []
try:
# optional: errors='strict', buffering=1
with open(args.input_file, mode='r', encoding='utf-8') as inputfile:
for line in inputfile:
input_urls.append(line.strip())
input_urls.extend(line.strip() for line in inputfile)
except UnicodeDecodeError:
sys.exit('ERROR: system, file type or buffer encoding')
elif args.URL:
input_urls = [args.URL]
elif args.crawl:
input_urls = [args.crawl]
elif args.explore:
input_urls = [args.explore]
elif args.probe:
input_urls = [args.probe]
elif args.feed:
input_urls = [args.feed]
elif args.sitemap:
input_urls = [args.sitemap]
else:
for arg in INPUT_URLS_ARGS:
if getattr(args, arg):
input_urls = [getattr(args, arg)]
break

if not input_urls:
LOGGER.warning('No input provided')
input_urls = []

# uniq URLs while preserving order (important)
return uniquify_list(input_urls)


def load_blacklist(filename):
'''Read list of unwanted URLs'''
blacklist = set()
with open(filename, mode='r', encoding='utf-8') as inputfh:
for line in inputfh:
url = line.strip()
# if validate_url(url)[0] is True:
blacklist.add(URL_BLACKLIST_REGEX.sub('', url))
with open(filename, 'r', encoding='utf-8') as inputfh:
# if validate_url(url)[0] is True:
blacklist = {URL_BLACKLIST_REGEX.sub('', line.strip()) for line in inputfh}
return blacklist
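
Note on the refactor above: the elif chain becomes a single loop over INPUT_URLS_ARGS using getattr(), and load_blacklist builds its set in one comprehension. A minimal standalone sketch of the same patterns (the Namespace values and the trailing-slash regex are invented stand-ins, not trafilatura internals):

    import re
    from argparse import Namespace

    INPUT_URLS_ARGS = ['URL', 'crawl', 'explore', 'probe', 'feed', 'sitemap']
    TRAILING_SLASH = re.compile(r'/+$')  # illustrative stand-in for URL_BLACKLIST_REGEX

    def first_input_url(args):
        # pick the first URL-bearing argument, as in the loop above
        for arg in INPUT_URLS_ARGS:
            if getattr(args, arg, None):
                return [getattr(args, arg)]
        return []

    args = Namespace(URL=None, crawl=None, explore=None,
                     probe='https://example.org/page', feed=None, sitemap=None)
    print(first_input_url(args))  # ['https://example.org/page']

    lines = ['https://example.org/a/\n', 'https://example.org/b\n']
    print({TRAILING_SLASH.sub('', line.strip()) for line in lines})
    # both URLs, stripped of the trailing slash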


@@ -139,28 +139,23 @@ def get_writable_path(destdir, extension):

def determine_output_path(args, orig_filename, content, counter=None, new_filename=None):
'''Pick a directory based on selected options and a file name based on output type'''
# determine extension
extension = '.txt'
if args.output_format in ('xml', 'xmltei'):
extension = '.xml'
elif args.output_format == 'csv':
extension = '.csv'
elif args.output_format == 'json':
extension = '.json'
# determine directory
if args.keep_dirs is True:
# determine extension, TXT by default
extension = EXTENSION_MAPPING.get(args.output_format, '.txt')

if args.keep_dirs:
# strip directory
orig_directory = STRIP_DIR.sub('', orig_filename)
destination_directory = path.join(args.output_dir, orig_directory)
original_dir = STRIP_DIR.sub('', orig_filename)
destination_dir = path.join(args.output_dir, original_dir)
# strip extension
filename = STRIP_EXTENSION.sub('', orig_filename)
output_path = path.join(args.output_dir, filename + extension)
else:
destination_directory = determine_counter_dir(args.output_dir, counter)
destination_dir = determine_counter_dir(args.output_dir, counter)
# use cryptographic hash on file contents to define name
filename = new_filename or generate_hash_filename(content)
output_path = path.join(destination_directory, filename + extension)
return output_path, destination_directory

output_path = path.join(destination_dir, filename + extension)
return output_path, destination_dir
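
The extension choice now relies on dict.get() with '.txt' as the fallback for any unmapped output format; a quick illustrative check:

    EXTENSION_MAPPING = {'csv': '.csv', 'json': '.json', 'xml': '.xml', 'xmltei': '.xml'}

    for output_format in ('xmltei', 'json', 'txt'):
        # unmapped formats (here 'txt') fall back to the '.txt' default
        print(output_format, EXTENSION_MAPPING.get(output_format, '.txt'))
    # xmltei .xml / json .json / txt .txt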



def archive_html(htmlstring, args, counter=None):
@@ -182,9 +177,9 @@ def write_result(result, args, orig_filename=None, counter=None, new_filename=No
if args.output_dir is None:
sys.stdout.write(result + '\n')
else:
destination_path, destination_directory = determine_output_path(args, orig_filename, result, counter, new_filename)
destination_path, destination_dir = determine_output_path(args, orig_filename, result, counter, new_filename)
# check the directory status
if check_outputdir_status(destination_directory) is True:
if check_outputdir_status(destination_dir) is True:
with open(destination_path, mode='w', encoding='utf-8') as outputfile:
outputfile.write(result)

@@ -268,8 +263,8 @@ def cli_discovery(args):

def build_exploration_dict(url_store, input_urls, args):
"Find domains for which nothing has been found and add info to the crawl dict."
input_domains = set(extract_domain(u) for u in input_urls)
known_domains = set(extract_domain(u) for u in url_store.get_known_domains())
input_domains = {extract_domain(u) for u in input_urls}
known_domains = {extract_domain(u) for u in url_store.get_known_domains()}
still_to_crawl = input_domains - known_domains
new_input_urls = [u for u in input_urls if extract_domain(u) in still_to_crawl]
control_dict = add_to_compressed_dict(
4 changes: 2 additions & 2 deletions trafilatura/core.py
@@ -827,8 +827,8 @@ def determine_returnstring(document, output_format, include_formatting, tei_vali
else:
returnstring = xmltotxt(document.body, include_formatting)
if document.commentsbody is not None:
returnstring += '\n' + xmltotxt(document.commentsbody, include_formatting)
returnstring = returnstring.strip()
comments_text = xmltotxt(document.commentsbody, include_formatting)
returnstring = f"{returnstring}\n{comments_text}".strip()
# normalize Unicode format (defaults to NFC)
return normalize_unicode(returnstring)
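
The f-string keeps the previous behavior (comments appended on a new line, result stripped); a toy illustration with invented strings:

    returnstring = "Main text.\n"
    comments_text = "A comment.\n"
    # same result as the old "+= '\n' + ..." followed by .strip()
    print(repr(f"{returnstring}\n{comments_text}".strip()))
    # 'Main text.\n\nA comment.'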

17 changes: 5 additions & 12 deletions trafilatura/downloads.py
@@ -206,28 +206,21 @@ def is_live_page(url):

def add_to_compressed_dict(inputlist, blacklist=None, url_filter=None, url_store=None, compression=False, verbose=False):
'''Filter, convert input URLs and add them to domain-aware processing dictionary'''
# init
if url_store is None:
url_store = UrlStore(
compressed=compression,
strict=False,
verbose=verbose
)
# deduplicate while keeping order

inputlist = uniquify_list(inputlist)
# filter

if blacklist:
inputlist = [u for u in inputlist if URL_BLACKLIST_REGEX.sub('', u) not in blacklist]

if url_filter:
filtered_list = []
while inputlist:
u = inputlist.pop()
for f in url_filter:
if f in u:
filtered_list.append(u)
break
inputlist = filtered_list
# validate and store
inputlist = [u for u in inputlist if any(f in u for f in url_filter)]

url_store.add_urls(inputlist)
return url_store
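
The rewritten url_filter step keeps any URL containing at least one filter substring, and unlike the earlier pop()-based loop it preserves the input order; a standalone sketch with made-up URLs:

    inputlist = [
        'https://example.org/blog/post-1',
        'https://example.org/shop/item-2',
        'https://example.org/blog/post-3',
    ]
    url_filter = ['/blog/']
    print([u for u in inputlist if any(f in u for f in url_filter)])
    # ['https://example.org/blog/post-1', 'https://example.org/blog/post-3']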

36 changes: 12 additions & 24 deletions trafilatura/filters.py
@@ -61,21 +61,15 @@ def check_html_lang(tree, target_language, strict=False):
'''Check HTML meta-elements for language information and split
the result in case there are several languages'''
# https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Language
target_elements = tree.findall('.//meta[@http-equiv="content-language"][@content]')
if target_elements:
for elem in target_elements:
if target_language in RE_HTML_LANG.split(elem.get('content').lower()):
return True
LOGGER.debug('HTML content-language failed')
return False
# locale
target_elements = tree.findall('.//meta[@property="og:locale"][@content]')
if target_elements:
for elem in target_elements:
if target_language in RE_HTML_LANG.split(elem.get('content').lower()):
return True
LOGGER.debug('HTML og:locale failed')
return False
target_attrs = ['http-equiv="content-language"', 'property="og:locale"']
for attr in target_attrs:
target_elements = tree.findall(f'.//meta[@{attr}][@content]')
if target_elements:
for elem in target_elements:
if target_language in RE_HTML_LANG.split(elem.get('content', '').lower()):
return True
LOGGER.debug('%s failed', attr)
return False
# HTML lang attribute: sometimes a wrong indication
if strict is True:
target_elements = tree.xpath('//html[@lang]')
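
The duplicated content-language and og:locale checks collapse into one loop over the attribute selectors; roughly, using lxml and a simplified splitting regex standing in for RE_HTML_LANG:

    import re
    from lxml import html

    RE_LANG = re.compile(r'[,;\s]+')  # simplified stand-in for RE_HTML_LANG

    def meta_lang_matches(tree, target_language):
        for attr in ('http-equiv="content-language"', 'property="og:locale"'):
            elements = tree.findall(f'.//meta[@{attr}][@content]')
            if elements:
                return any(target_language in RE_LANG.split(e.get('content', '').lower())
                           for e in elements)
        return True  # no meta hints found; the real function then checks <html lang>

    doc = '<html><head><meta http-equiv="content-language" content="de, en"></head><body></body></html>'
    print(meta_lang_matches(html.fromstring(doc), 'de'))  # True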
@@ -122,19 +116,13 @@ def language_filter(temp_text, temp_comments, target_language, docmeta):

def textfilter(element):
'''Filter out unwanted text'''
# print('#', element.text)
if element.text is None and element.tail is not None:
testtext = element.tail
else:
testtext = element.text
if text_chars_test(testtext) is False:
return True
testtext = element.tail if element.text is None else element.text
# to check: line len → continue if len(line) <= 5
return any(RE_FILTER.match(line) for line in testtext.splitlines())
return not text_chars_test(testtext) or any(map(RE_FILTER.match, testtext.splitlines()))


def text_chars_test(string):
'''Determine if a string is only composed of spaces and/or control characters'''
# or not re.search(r'\w', string)
# return string is not None and len(string) != 0 and not string.isspace()
return string not in (None, '') and not string.isspace()
return bool(string) and not string.isspace()
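
For reference, the condensed textfilter and text_chars_test behave like this small standalone version (the filter regex here is a placeholder, not trafilatura's RE_FILTER):

    import re
    from lxml import etree

    RE_FILTER = re.compile(r'\W*(Share|Tweet|Comments?)\W*$', re.IGNORECASE)  # placeholder

    def text_chars_test(string):
        # True if the string holds more than spaces/control characters
        return bool(string) and not string.isspace()

    def textfilter(element):
        # True means: discard this element's text
        testtext = element.tail if element.text is None else element.text
        return not text_chars_test(testtext) or any(map(RE_FILTER.match, testtext.splitlines()))

    elem = etree.Element('p')
    elem.text = 'Share'
    print(textfilter(elem))   # True, boilerplate-like line
    elem.text = 'A real sentence.'
    print(textfilter(elem))   # False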
82 changes: 35 additions & 47 deletions trafilatura/htmlprocessing.py
@@ -43,6 +43,21 @@
)


REND_TAG_MAPPING = {
'em': '#i',
'i': '#i',
'b': '#b',
'strong': '#b',
'u': '#u',
'kbd': '#t',
'samp': '#t',
'tt': '#t',
'var': '#t',
'sub': '#sub',
'sup': '#sup'
}


def tree_cleaning(tree, options):
'''Prune the tree by discarding unwanted elements'''
# determine cleaning strategy, use lists to keep it deterministic
@@ -117,19 +132,16 @@ def collect_link_info(links_xpath, favor_precision=False):
# init
shortelems, mylist = 0, []
# longer strings impact recall in favor of precision
if favor_precision is False:
threshold = 10
else:
threshold = 50
threshold = 10 if not favor_precision else 50
# examine the elements
for subelem in links_xpath:
subelemtext = trim(subelem.text_content())
if not subelemtext:
continue
mylist.append(subelemtext)
lengths = [len(text) for text in mylist]
shortelems = len([l for l in lengths if l < threshold])
return sum(lengths), len(mylist), shortelems, mylist
if subelemtext:
mylist.append(subelemtext)
if len(subelemtext) < threshold:
shortelems += 1
lengths = sum(len(text) for text in mylist)
return lengths, len(mylist), shortelems, mylist
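
collect_link_info now counts short anchors in the same pass instead of building a separate list of lengths; a simplified sketch with plain strings in place of lxml elements:

    def collect_link_info(link_texts, favor_precision=False):
        threshold = 10 if not favor_precision else 50
        shortelems, mylist = 0, []
        for text in link_texts:
            text = text.strip()
            if text:
                mylist.append(text)
                if len(text) < threshold:
                    shortelems += 1
        return sum(len(t) for t in mylist), len(mylist), shortelems, mylist

    print(collect_link_info(['Home', 'Contact', 'A much longer anchor text here']))
    # (41, 3, 2, ['Home', 'Contact', 'A much longer anchor text here'])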


def link_density_test(element, text, favor_precision=False):
@@ -222,10 +234,9 @@ def convert_tags(tree, options, url=None):
'''Simplify markup and convert relevant HTML tags to an XML standard'''
# delete links for faster processing
if options.links is False:
xpath_expr = './/div//a|.//ul//a' # .//p//a ?
if options.tables is True:
xpath_expr = './/div//a|.//table//a|.//ul//a' # .//p//a ?
else:
xpath_expr = './/div//a|.//ul//a' # .//p//a ?
xpath_expr += '|.//table//a'
# necessary for further detection
for elem in tree.xpath(xpath_expr):
elem.tag = 'ref'
@@ -246,32 +257,12 @@ def convert_tags(tree, options, url=None):
elem.set('target', target)
# include_formatting
if options.formatting is False:
strip_tags(tree, 'em', 'i', 'b', 'strong', 'u', 'kbd', 'samp', 'tt', 'var', 'sub', 'sup')
strip_tags(tree, *REND_TAG_MAPPING)
else:
for elem in tree.iter('em', 'i', 'b', 'strong', 'u', 'kbd', 'samp', 'tt', 'var', 'sub', 'sup'):
# italics
if elem.tag in ('em', 'i'):
elem.tag = 'hi'
elem.set('rend', '#i')
# bold font
elif elem.tag in ('b', 'strong'):
elem.tag = 'hi'
elem.set('rend', '#b')
# u (very rare)
elif elem.tag == 'u':
elem.tag = 'hi'
elem.set('rend', '#u')
# tt (very rare)
elif elem.tag in ('kbd', 'samp', 'tt', 'var'):
elem.tag = 'hi'
elem.set('rend', '#t')
# sub and sup (very rare)
elif elem.tag == 'sub':
elem.tag = 'hi'
elem.set('rend', '#sub')
elif elem.tag == 'sup':
elem.tag = 'hi'
elem.set('rend', '#sup')
for elem in tree.iter(list(REND_TAG_MAPPING)):
attribute = REND_TAG_MAPPING[elem.tag]
elem.tag = 'hi'
elem.set('rend', attribute)
# iterate over all concerned elements
for elem in tree.iter('blockquote', 'br', 'del', 'details', 'dl', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'ol', 'pre', 'q', 's', 'strike', 'ul'):
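
REND_TAG_MAPPING now drives both branches above: strip_tags(tree, *REND_TAG_MAPPING) drops the inline tags when formatting is off, and the loop rewrites them to <hi rend="..."> otherwise. A short lxml sketch with an invented snippet:

    from lxml import html
    from lxml.etree import tostring

    REND_TAG_MAPPING = {'em': '#i', 'i': '#i', 'b': '#b', 'strong': '#b', 'u': '#u',
                        'kbd': '#t', 'samp': '#t', 'tt': '#t', 'var': '#t',
                        'sub': '#sub', 'sup': '#sup'}

    tree = html.fragment_fromstring('<p>some <b>bold</b> and <em>italic</em> text</p>')
    for elem in tree.iter(*REND_TAG_MAPPING):
        # look up the rend value before renaming the tag
        elem.set('rend', REND_TAG_MAPPING[elem.tag])
        elem.tag = 'hi'
    print(tostring(tree, encoding='unicode'))
    # <p>some <hi rend="#b">bold</hi> and <hi rend="#i">italic</hi> text</p>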
# ul/ol → list / li → item
Expand All @@ -282,7 +273,7 @@ def convert_tags(tree, options, url=None):
for subelem in elem.iter('dd', 'dt', 'li'):
# keep track of dd/dt items
if subelem.tag in ('dd', 'dt'):
subelem.set('rend', subelem.tag + '-' + str(i))
subelem.set('rend', f"{subelem.tag}-{i}")
# increment counter after <dd> in description list
if subelem.tag == 'dd':
i += 1
@@ -330,20 +321,18 @@ def handle_textnode(element, options, comments_fix=True, preserve_spaces=False):
if element.text is None:
# try the tail
# LOGGER.debug('using tail for element %s', element.tag)
element.text = element.tail
element.tail = ''
element.text, element.tail = element.tail, ''
# handle differently for br/lb
if comments_fix is True and element.tag == 'lb':
if comments_fix and element.tag == 'lb':
element.tag = 'p'
# trim
if preserve_spaces is False:
element.text = trim(element.text)
if element.tail:
element.tail = trim(element.tail)
# filter content
if not element.text: # or not re.search(r'\w', element.text): # text_content()?
return None
if textfilter(element) is True:
# or not re.search(r'\w', element.text): # text_content()?
if not element.text or textfilter(element) is True:
return None
if options.dedup and duplicate_test(element, options.config) is True:
return None
@@ -360,8 +349,7 @@ def process_node(element, options):
element.text, element.tail = trim(element.text), trim(element.tail)
# adapt content string
if element.tag != 'lb' and not element.text and element.tail:
element.text = element.tail
element.tail = None
element.text, element.tail = element.tail, None
# content checks
if element.text or element.tail:
if textfilter(element) is True:
4 changes: 1 addition & 3 deletions trafilatura/utils.py
@@ -243,9 +243,7 @@ def txttocsv(text, comments, docmeta):
@lru_cache(maxsize=2**14) # sys.maxunicode = 1114111
def return_printables_and_spaces(char):
'Return a character if it belongs to certain classes'
if char.isprintable() or char.isspace():
return char
return ''
return char if char.isprintable() or char.isspace() else ''


def remove_control_characters(string):