diff --git a/tests/cli_tests.py b/tests/cli_tests.py index 42f7e884..de2062ae 100644 --- a/tests/cli_tests.py +++ b/tests/cli_tests.py @@ -448,7 +448,7 @@ def test_crawling(): def test_probing(): "Test webpage probing functions." - url = 'https://httpbun.org/html' + url = 'https://example.org/' testargs = ['', '--probe', url, '--target-language', 'de'] with patch.object(sys, 'argv', testargs): args = cli.parse_args(testargs) diff --git a/trafilatura/cli_utils.py b/trafilatura/cli_utils.py index 876b1d91..0db1f3d8 100644 --- a/trafilatura/cli_utils.py +++ b/trafilatura/cli_utils.py @@ -42,45 +42,45 @@ STRIP_DIR = re.compile(r'[^/]+$') STRIP_EXTENSION = re.compile(r'\.[a-z]{2,5}$') +INPUT_URLS_ARGS = ['URL', 'crawl', 'explore', 'probe', 'feed', 'sitemap'] + +EXTENSION_MAPPING = { + 'csv': '.csv', + 'json': '.json', + 'xml': '.xml', + 'xmltei': '.xml', +} + def load_input_urls(args): '''Read list of URLs to process or derive one from command-line arguments''' + input_urls = [] + if args.input_file: - input_urls = [] try: # optional: errors='strict', buffering=1 with open(args.input_file, mode='r', encoding='utf-8') as inputfile: - for line in inputfile: - input_urls.append(line.strip()) + input_urls.extend(line.strip() for line in inputfile) except UnicodeDecodeError: sys.exit('ERROR: system, file type or buffer encoding') - elif args.URL: - input_urls = [args.URL] - elif args.crawl: - input_urls = [args.crawl] - elif args.explore: - input_urls = [args.explore] - elif args.probe: - input_urls = [args.probe] - elif args.feed: - input_urls = [args.feed] - elif args.sitemap: - input_urls = [args.sitemap] else: + for arg in INPUT_URLS_ARGS: + if getattr(args, arg): + input_urls = [getattr(args, arg)] + break + + if not input_urls: LOGGER.warning('No input provided') - input_urls = [] + # uniq URLs while preserving order (important) return uniquify_list(input_urls) def load_blacklist(filename): '''Read list of unwanted URLs''' - blacklist = set() - with open(filename, mode='r', encoding='utf-8') as inputfh: - for line in inputfh: - url = line.strip() - # if validate_url(url)[0] is True: - blacklist.add(URL_BLACKLIST_REGEX.sub('', url)) + with open(filename, 'r', encoding='utf-8') as inputfh: + # if validate_url(url)[0] is True: + blacklist = {URL_BLACKLIST_REGEX.sub('', line.strip()) for line in inputfh} return blacklist @@ -139,28 +139,23 @@ def get_writable_path(destdir, extension): def determine_output_path(args, orig_filename, content, counter=None, new_filename=None): '''Pick a directory based on selected options and a file name based on output type''' - # determine extension - extension = '.txt' - if args.output_format in ('xml', 'xmltei'): - extension = '.xml' - elif args.output_format == 'csv': - extension = '.csv' - elif args.output_format == 'json': - extension = '.json' - # determine directory - if args.keep_dirs is True: + # determine extension, TXT by default + extension = EXTENSION_MAPPING.get(args.output_format, '.txt') + + if args.keep_dirs: # strip directory - orig_directory = STRIP_DIR.sub('', orig_filename) - destination_directory = path.join(args.output_dir, orig_directory) + original_dir = STRIP_DIR.sub('', orig_filename) + destination_dir = path.join(args.output_dir, original_dir) # strip extension filename = STRIP_EXTENSION.sub('', orig_filename) - output_path = path.join(args.output_dir, filename + extension) else: - destination_directory = determine_counter_dir(args.output_dir, counter) + destination_dir = determine_counter_dir(args.output_dir, counter) # use cryptographic 
hash on file contents to define name filename = new_filename or generate_hash_filename(content) - output_path = path.join(destination_directory, filename + extension) - return output_path, destination_directory + + output_path = path.join(destination_dir, filename + extension) + return output_path, destination_dir + def archive_html(htmlstring, args, counter=None): @@ -182,9 +177,9 @@ def write_result(result, args, orig_filename=None, counter=None, new_filename=No if args.output_dir is None: sys.stdout.write(result + '\n') else: - destination_path, destination_directory = determine_output_path(args, orig_filename, result, counter, new_filename) + destination_path, destination_dir = determine_output_path(args, orig_filename, result, counter, new_filename) # check the directory status - if check_outputdir_status(destination_directory) is True: + if check_outputdir_status(destination_dir) is True: with open(destination_path, mode='w', encoding='utf-8') as outputfile: outputfile.write(result) @@ -268,8 +263,8 @@ def cli_discovery(args): def build_exploration_dict(url_store, input_urls, args): "Find domains for which nothing has been found and add info to the crawl dict." - input_domains = set(extract_domain(u) for u in input_urls) - known_domains = set(extract_domain(u) for u in url_store.get_known_domains()) + input_domains = {extract_domain(u) for u in input_urls} + known_domains = {extract_domain(u) for u in url_store.get_known_domains()} still_to_crawl = input_domains - known_domains new_input_urls = [u for u in input_urls if extract_domain(u) in still_to_crawl] control_dict = add_to_compressed_dict( diff --git a/trafilatura/core.py b/trafilatura/core.py index e89a6f81..d15164e7 100644 --- a/trafilatura/core.py +++ b/trafilatura/core.py @@ -827,8 +827,8 @@ def determine_returnstring(document, output_format, include_formatting, tei_vali else: returnstring = xmltotxt(document.body, include_formatting) if document.commentsbody is not None: - returnstring += '\n' + xmltotxt(document.commentsbody, include_formatting) - returnstring = returnstring.strip() + comments_text = xmltotxt(document.commentsbody, include_formatting) + returnstring = f"{returnstring}\n{comments_text}".strip() # normalize Unicode format (defaults to NFC) return normalize_unicode(returnstring) diff --git a/trafilatura/downloads.py b/trafilatura/downloads.py index aa6b229e..13c17d2e 100644 --- a/trafilatura/downloads.py +++ b/trafilatura/downloads.py @@ -206,28 +206,21 @@ def is_live_page(url): def add_to_compressed_dict(inputlist, blacklist=None, url_filter=None, url_store=None, compression=False, verbose=False): '''Filter, convert input URLs and add them to domain-aware processing dictionary''' - # init if url_store is None: url_store = UrlStore( compressed=compression, strict=False, verbose=verbose ) - # deduplicate while keeping order + inputlist = uniquify_list(inputlist) - # filter + if blacklist: inputlist = [u for u in inputlist if URL_BLACKLIST_REGEX.sub('', u) not in blacklist] + if url_filter: - filtered_list = [] - while inputlist: - u = inputlist.pop() - for f in url_filter: - if f in u: - filtered_list.append(u) - break - inputlist = filtered_list - # validate and store + inputlist = [u for u in inputlist if any(f in u for f in url_filter)] + url_store.add_urls(inputlist) return url_store diff --git a/trafilatura/filters.py b/trafilatura/filters.py index 5f532b23..ca1637b7 100644 --- a/trafilatura/filters.py +++ b/trafilatura/filters.py @@ -61,21 +61,15 @@ def check_html_lang(tree, target_language, 
strict=False): '''Check HTML meta-elements for language information and split the result in case there are several languages''' # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Language - target_elements = tree.findall('.//meta[@http-equiv="content-language"][@content]') - if target_elements: - for elem in target_elements: - if target_language in RE_HTML_LANG.split(elem.get('content').lower()): - return True - LOGGER.debug('HTML content-language failed') - return False - # locale - target_elements = tree.findall('.//meta[@property="og:locale"][@content]') - if target_elements: - for elem in target_elements: - if target_language in RE_HTML_LANG.split(elem.get('content').lower()): - return True - LOGGER.debug('HTML og:locale failed') - return False + target_attrs = ['http-equiv="content-language"', 'property="og:locale"'] + for attr in target_attrs: + target_elements = tree.findall(f'.//meta[@{attr}][@content]') + if target_elements: + for elem in target_elements: + if target_language in RE_HTML_LANG.split(elem.get('content', '').lower()): + return True + LOGGER.debug('%s failed', attr) + return False # HTML lang attribute: sometimes a wrong indication if strict is True: target_elements = tree.xpath('//html[@lang]') @@ -122,19 +116,13 @@ def language_filter(temp_text, temp_comments, target_language, docmeta): def textfilter(element): '''Filter out unwanted text''' - # print('#', element.text) - if element.text is None and element.tail is not None: - testtext = element.tail - else: - testtext = element.text - if text_chars_test(testtext) is False: - return True + testtext = element.tail if element.text is None else element.text # to check: line len → continue if len(line) <= 5 - return any(RE_FILTER.match(line) for line in testtext.splitlines()) + return not text_chars_test(testtext) or any(map(RE_FILTER.match, testtext.splitlines())) def text_chars_test(string): '''Determine if a string is only composed of spaces and/or control characters''' # or not re.search(r'\w', string) # return string is not None and len(string) != 0 and not string.isspace() - return string not in (None, '') and not string.isspace() + return bool(string) and not string.isspace() diff --git a/trafilatura/htmlprocessing.py b/trafilatura/htmlprocessing.py index 809c4ccc..47c17596 100644 --- a/trafilatura/htmlprocessing.py +++ b/trafilatura/htmlprocessing.py @@ -43,6 +43,21 @@ ) +REND_TAG_MAPPING = { + 'em': '#i', + 'i': '#i', + 'b': '#b', + 'strong': '#b', + 'u': '#u', + 'kbd': '#t', + 'samp': '#t', + 'tt': '#t', + 'var': '#t', + 'sub': '#sub', + 'sup': '#sup' +} + + def tree_cleaning(tree, options): '''Prune the tree by discarding unwanted elements''' # determine cleaning strategy, use lists to keep it deterministic @@ -117,19 +132,16 @@ def collect_link_info(links_xpath, favor_precision=False): # init shortelems, mylist = 0, [] # longer strings impact recall in favor of precision - if favor_precision is False: - threshold = 10 - else: - threshold = 50 + threshold = 10 if not favor_precision else 50 # examine the elements for subelem in links_xpath: subelemtext = trim(subelem.text_content()) - if not subelemtext: - continue - mylist.append(subelemtext) - lengths = [len(text) for text in mylist] - shortelems = len([l for l in lengths if l < threshold]) - return sum(lengths), len(mylist), shortelems, mylist + if subelemtext: + mylist.append(subelemtext) + if len(subelemtext) < threshold: + shortelems += 1 + lengths = sum(len(text) for text in mylist) + return lengths, len(mylist), shortelems, mylist def 
link_density_test(element, text, favor_precision=False): @@ -222,10 +234,9 @@ def convert_tags(tree, options, url=None): '''Simplify markup and convert relevant HTML tags to an XML standard''' # delete links for faster processing if options.links is False: + xpath_expr = './/div//a|.//ul//a' # .//p//a ? if options.tables is True: - xpath_expr = './/div//a|.//table//a|.//ul//a' # .//p//a ? - else: - xpath_expr = './/div//a|.//ul//a' # .//p//a ? + xpath_expr += '|.//table//a' # necessary for further detection for elem in tree.xpath(xpath_expr): elem.tag = 'ref' @@ -246,32 +257,12 @@ def convert_tags(tree, options, url=None): elem.set('target', target) # include_formatting if options.formatting is False: - strip_tags(tree, 'em', 'i', 'b', 'strong', 'u', 'kbd', 'samp', 'tt', 'var', 'sub', 'sup') + strip_tags(tree, *REND_TAG_MAPPING) else: - for elem in tree.iter('em', 'i', 'b', 'strong', 'u', 'kbd', 'samp', 'tt', 'var', 'sub', 'sup'): - # italics - if elem.tag in ('em', 'i'): - elem.tag = 'hi' - elem.set('rend', '#i') - # bold font - elif elem.tag in ('b', 'strong'): - elem.tag = 'hi' - elem.set('rend', '#b') - # u (very rare) - elif elem.tag == 'u': - elem.tag = 'hi' - elem.set('rend', '#u') - # tt (very rare) - elif elem.tag in ('kbd', 'samp', 'tt', 'var'): - elem.tag = 'hi' - elem.set('rend', '#t') - # sub and sup (very rare) - elif elem.tag == 'sub': - elem.tag = 'hi' - elem.set('rend', '#sub') - elif elem.tag == 'sup': - elem.tag = 'hi' - elem.set('rend', '#sup') + for elem in tree.iter(list(REND_TAG_MAPPING)): + attribute = REND_TAG_MAPPING[elem.tag] + elem.tag = 'hi' + elem.set('rend', attribute) # iterate over all concerned elements for elem in tree.iter('blockquote', 'br', 'del', 'details', 'dl', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'ol', 'pre', 'q', 's', 'strike', 'ul'): # ul/ol → list / li → item @@ -282,7 +273,7 @@ def convert_tags(tree, options, url=None): for subelem in elem.iter('dd', 'dt', 'li'): # keep track of dd/dt items if subelem.tag in ('dd', 'dt'): - subelem.set('rend', subelem.tag + '-' + str(i)) + subelem.set('rend', f"{subelem.tag}-{i}") # increment counter after
for element in xmldoc.findall(".//text/body//div/lb"): - if element.tail is not None and element.tail.strip(): + if element.tail and element.tail.strip(): element.tag = 'p' element.text = element.tail element.tail = None # look for elements that are not valid for element in xmldoc.findall('.//text/body//*'): - if element.tag in {"ab", "p"} and element.tail and element.tail.strip(): + if element.tag in TEI_REMOVE_TAIL and element.tail and element.tail.strip(): _handle_unwanted_tails(element) # check elements if element.tag not in TEI_VALID_TAGS: @@ -210,7 +202,6 @@ def validate_tei(xmldoc): # , filename="" def replace_element_text(element, include_formatting): '''Determine element text based on text and tail''' - full_text = '' # handle formatting: convert to markdown if include_formatting is True and element.text is not None: if element.tag in ('del', 'head'): @@ -219,53 +210,45 @@ def replace_element_text(element, include_formatting): number = int(element.get('rend')[1]) except (TypeError, ValueError): number = 2 - element.text = ''.join(['#'*number, ' ', element.text]) + element.text = f'{"#" * number} {element.text}' elif element.tag == 'del': - element.text = ''.join(['~~', element.text, '~~']) + element.text = f'~~{element.text}~~' elif element.tag == 'hi': - if element.get('rend') == '#b': - element.text = ''.join(['**', element.text, '**']) - elif element.get('rend') == '#i': - element.text = ''.join(['*', element.text, '*']) - elif element.get('rend') == '#u': - element.text = ''.join(['__', element.text, '__']) - elif element.get('rend') == '#t': - element.text = ''.join(['`', element.text, '`']) + rend = element.get('rend') + if rend in HI_FORMATTING: + element.text = f'{HI_FORMATTING[rend]}{element.text}{HI_FORMATTING[rend]}' # handle links if element.tag == 'ref': if element.text is not None: + link_text = f'[{element.text}]' if element.get('target') is not None: - element.text = ''.join(['[', element.text, ']', '(', element.get('target'), ')']) + element.text = f"{link_text}({element.get('target')})" else: LOGGER.warning('missing link attribute: %s %s', element.text, element.attrib) - element.text = ''.join(['[', element.text, ']']) + element.text = link_text else: LOGGER.warning('empty link: %s %s', element.text, element.attrib) # handle text - if element.text is not None and element.tail is not None: - full_text = ''.join([element.text, element.tail]) - elif element.text is not None: - full_text = element.text - elif element.tail is not None: - full_text = element.tail - return full_text + return (element.text or '') + (element.tail or '') def merge_with_parent(element, include_formatting=False): '''Merge element with its parent and convert formatting to markdown.''' parent = element.getparent() - if parent is None: + if not parent: return + full_text = replace_element_text(element, include_formatting) + previous = element.getprevious() if previous is not None: # There is a previous node, append text to its tail if previous.tail is not None: - previous.tail = ' '.join([previous.tail, full_text]) + previous.tail = f'{previous.tail} {full_text}' else: previous.tail = full_text elif parent.text is not None: - parent.text = ' '.join([parent.text, full_text]) + parent.text = f'{parent.text} {full_text}' else: parent.text = full_text parent.remove(element) @@ -280,10 +263,8 @@ def xmltotxt(xmloutput, include_formatting): if element.text is None and element.tail is None: if element.tag == 'graphic': # add source, default to '' - text = element.get('title', '') - if 
element.get('alt') is not None: - text += ' ' + element.get('alt') - returnlist.extend(['![', text, ']', '(', element.get('src', ''), ')']) + text = f'{element.get("title", "")} {element.get("alt", "")}' + returnlist.extend(['![', text.strip(), ']', '(', element.get('src', ''), ')']) # newlines for textless elements if element.tag in ('graphic', 'row', 'table'): returnlist.append('\n') @@ -292,12 +273,7 @@ def xmltotxt(xmloutput, include_formatting): textelement = replace_element_text(element, include_formatting) # common elements if element.tag in NEWLINE_ELEMS: - returnlist.extend(['\n', textelement, '\n']) - # particular cases - elif element.tag == 'item': - returnlist.extend(['\n- ', textelement, '\n']) - elif element.tag == 'cell': - returnlist.extend(['|', textelement, '|']) + returnlist.extend([NEWLINE_ELEMS[element.tag], textelement, '\n']) elif element.tag == 'comments': returnlist.append('\n\n') else: @@ -330,7 +306,7 @@ def write_teitree(docmeta): def _define_publisher_string(docmeta): '''Construct a publisher string to include in TEI header''' if docmeta.hostname and docmeta.sitename: - publisherstring = docmeta.sitename.strip() + ' (' + docmeta.hostname + ')' + publisherstring = f'{docmeta.sitename.strip()} ({docmeta.hostname})' elif docmeta.hostname: publisherstring = docmeta.hostname elif docmeta.sitename: @@ -425,19 +401,20 @@ def write_fullheader(teidoc, docmeta): def _handle_text_content_of_div_nodes(element): - if element.text is not None and element.text.strip(): + if element.text and element.text.strip(): if element.getchildren() and element[0].tag == 'p': p_text = element[0].text or "" - element[0].text = ' '.join([element.text, p_text]).strip() + element[0].text = f'{element.text} {p_text}'.strip() else: new_child = Element("p") new_child.text = element.text element.insert(0, new_child) element.text = None - if element.tail is not None and element.tail.strip(): + + if element.tail and element.tail.strip(): if element.getchildren() and element[-1].tag == 'p': p_text = element[-1].text or "" - element[-1].text = ' '.join([p_text, element.tail]).strip() + element[-1].text = f'{p_text} {element.tail}'.strip() else: new_child = Element("p") new_child.text = element.tail @@ -505,9 +482,10 @@ def _wrap_unwanted_siblings_of_div(div_element): def _move_element_one_level_up(element): parent = element.getparent() new_elem = Element("p") - for sibling in element.itersiblings(): - new_elem.append(sibling) + new_elem.extend(sibling for sibling in element.itersiblings()) + parent.addnext(element) + if element.tail is not None and element.tail.strip(): new_elem.text = element.tail.strip() element.tail = None
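
For reference, a minimal standalone sketch of the behaviour the REND_TAG_MAPPING refactor in convert_tags() is going for: one dictionary lookup replaces the former if/elif cascade over formatting tags, and the same dict feeds strip_tags(*REND_TAG_MAPPING) when formatting is disabled. This is illustrative only, not part of the patch; it uses a bare lxml tree and a made-up sample paragraph instead of the real options object, and it relies on lxml's iter() accepting a sequence of tag names, as the patched code does.

from lxml import etree

# same mapping as introduced in trafilatura/htmlprocessing.py
REND_TAG_MAPPING = {
    'em': '#i', 'i': '#i',
    'b': '#b', 'strong': '#b',
    'u': '#u',
    'kbd': '#t', 'samp': '#t', 'tt': '#t', 'var': '#t',
    'sub': '#sub', 'sup': '#sup',
}

# hypothetical input, stands in for the cleaned HTML tree
tree = etree.fromstring('<p>A <b>bold</b> word and an <em>italic</em> one.</p>')

# data-driven conversion: set the rend attribute from the mapping, then rename to hi
for elem in tree.iter(list(REND_TAG_MAPPING)):
    elem.set('rend', REND_TAG_MAPPING[elem.tag])
    elem.tag = 'hi'

print(etree.tostring(tree).decode())
# <p>A <hi rend="#b">bold</hi> word and an <hi rend="#i">italic</hi> one.</p>

Keeping the tag-to-rend relation in data rather than control flow means adding or dropping a formatting tag touches one dict entry instead of two code paths (the strip_tags call and the conversion loop).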