diff --git a/trafilatura/cli_utils.py b/trafilatura/cli_utils.py
index 0db1f3d8..bb4601d0 100644
--- a/trafilatura/cli_utils.py
+++ b/trafilatura/cli_utils.py
@@ -236,10 +236,11 @@ def cli_discovery(args):
     input_urls = url_store.dump_urls()
     if args.list:
         url_store.reset()
 
+    ext = use_config(filename=args.config_file).getboolean('DEFAULT', 'EXTERNAL_URLS')
     # link discovery and storage
     with ThreadPoolExecutor(max_workers=args.parallel) as executor:
-        futures = (executor.submit(func, url, target_lang=args.target_language) for url in input_urls)
+        futures = (executor.submit(func, url, target_lang=args.target_language, external=ext) for url in input_urls)
         # process results from the parallel threads and add them
         # to the compressed URL dictionary for further processing
         for future in as_completed(futures):
diff --git a/trafilatura/feeds.py b/trafilatura/feeds.py
index 21d2b74e..4172ed3b 100644
--- a/trafilatura/feeds.py
+++ b/trafilatura/feeds.py
@@ -27,7 +27,7 @@
 BLACKLIST = re.compile(r'\bcomments\b')  # no comment feed
 
 
-def handle_link_list(linklist, domainname, baseurl, target_lang=None):
+def handle_link_list(linklist, domainname, baseurl, target_lang=None, external=False):
     '''Examine links to determine if they are valid and lead
        to a web page'''
     output_links = []
@@ -38,7 +38,7 @@ def handle_link_list(linklist, domainname, baseurl, target_lang=None):
         # control output for validity
         checked = check_url(link, language=target_lang)
         if checked is not None:
-            if not is_similar_domain(domainname, checked[1]) and not "feed" in link:
+            if not external and not "feed" in link and not is_similar_domain(domainname, checked[1]):
                 LOGGER.warning('Rejected, diverging domain names: %s %s', domainname, checked[1])
             else:
                 output_links.append(checked[0])
@@ -48,7 +48,7 @@ def handle_link_list(linklist, domainname, baseurl, target_lang=None):
     return output_links
 
 
-def extract_links(feed_string, domainname, baseurl, reference, target_lang=None):
+def extract_links(feed_string, domainname, baseurl, reference, target_lang=None, external=False):
     '''Extract links from Atom and RSS feeds'''
     feed_links = []
     # check if it's a feed
@@ -91,7 +91,7 @@ def extract_links(feed_string, domainname, baseurl, reference, target_lang=None)
             )
 
     # refine
-    output_links = handle_link_list(feed_links, domainname, baseurl, target_lang)
+    output_links = handle_link_list(feed_links, domainname, baseurl, target_lang, external)
     output_links = [l for l in output_links if l != reference and l.count('/') > 2]
     # log result
     if feed_links:
@@ -145,14 +145,16 @@ def determine_feed(htmlstring, baseurl, reference):
     return output_urls
 
 
-def find_feed_urls(url, target_lang=None):
+def find_feed_urls(url, target_lang=None, external=False):
     """Try to find feed URLs.
 
     Args:
         url: Webpage or feed URL as string.
              Triggers URL-based filter if the webpage isn't a homepage.
         target_lang: Define a language to filter URLs based on heuristics
-            (two-letter string, ISO 639-1 format).
+                     (two-letter string, ISO 639-1 format).
+        external: Similar hosts only or external URLs
+                  (boolean, defaults to False).
 
     Returns:
         The extracted links as a list (sorted list of unique links).
@@ -166,12 +168,12 @@ def find_feed_urls(url, target_lang=None):
     downloaded = fetch_url(url)
     if downloaded is not None:
         # assume it's a feed
-        feed_links = extract_links(downloaded, domainname, baseurl, url, target_lang)
+        feed_links = extract_links(downloaded, domainname, baseurl, url, target_lang, external)
         if len(feed_links) == 0:
             # assume it's a web page
             for feed in determine_feed(downloaded, baseurl, url):
                 feed_string = fetch_url(feed)
-                feed_links.extend(extract_links(feed_string, domainname, baseurl, url, target_lang))
+                feed_links.extend(extract_links(feed_string, domainname, baseurl, url, target_lang, external))
         # filter triggered, prepare it
         if len(url) > len(baseurl) + 2:
             urlfilter = url
@@ -191,7 +193,7 @@ def find_feed_urls(url, target_lang=None):
             f'https://news.google.com/rss/search?q=site:{baseurl}&hl={target_lang}&scoring=n&num=100'
         )
         if downloaded is not None:
-            feed_links = extract_links(downloaded, domainname, baseurl, url, target_lang)
+            feed_links = extract_links(downloaded, domainname, baseurl, url, target_lang, external)
             feed_links = filter_urls(feed_links, urlfilter)
             LOGGER.debug('%s Google news links found for %s', len(feed_links), domainname)
     return feed_links
diff --git a/trafilatura/settings.cfg b/trafilatura/settings.cfg
index 457d4ff4..729f28c9 100644
--- a/trafilatura/settings.cfg
+++ b/trafilatura/settings.cfg
@@ -28,3 +28,6 @@ MAX_REPETITIONS = 2
 
 # Extraction option for Htmldate
 EXTENSIVE_DATE_SEARCH = on
+
+# URLs in feeds and sitemaps
+EXTERNAL_URLS = off
diff --git a/trafilatura/sitemaps.py b/trafilatura/sitemaps.py
index b17178a1..1a974e91 100644
--- a/trafilatura/sitemaps.py
+++ b/trafilatura/sitemaps.py
@@ -11,8 +11,14 @@
 from itertools import islice
 from typing import List, Optional
 
-from courlan import (clean_url, extract_domain, filter_urls, fix_relative_urls,
-                     get_hostinfo, lang_filter)
+from courlan import (
+    clean_url,
+    extract_domain,
+    filter_urls,
+    fix_relative_urls,
+    get_hostinfo,
+    lang_filter,
+)
 
 from .downloads import fetch_url, is_live_page
 from .settings import MAX_LINKS, MAX_SITEMAPS_SEEN
@@ -22,33 +28,49 @@
 # ROBOT_PARSER = urllib.robotparser.RobotFileParser()
-
-
-
 LOGGER = logging.getLogger(__name__)
 
-LINK_REGEX = re.compile(r'<loc>(?:<!\[CDATA\[)?(http.+?)(?:\]\]>)?</loc>')
-XHTML_REGEX = re.compile(r'<xhtml:link.+?/>', re.DOTALL)
+LINK_REGEX = re.compile(r"<loc>(?:<!\[CDATA\[)?(http.+?)(?:\]\]>)?</loc>")
+XHTML_REGEX = re.compile(r"<xhtml:link.+?/>", re.DOTALL)
 HREFLANG_REGEX = re.compile(r'href=["\'](.+?)["\']')
 
-WHITELISTED_PLATFORMS = re.compile(r'(?:blogger|blogpost|ghost|hubspot|livejournal|medium|typepad|squarespace|tumblr|weebly|wix|wordpress)\.')
+WHITELISTED_PLATFORMS = re.compile(
+    r"(?:blogger|blogpost|ghost|hubspot|livejournal|medium|typepad|squarespace|tumblr|weebly|wix|wordpress)\."
+)
 
-SITEMAP_FORMAT = re.compile(r'^.{0,5}<\?xml|<sitemap|<urlset')
+SITEMAP_FORMAT = re.compile(r"^.{0,5}<\?xml|<sitemap|<urlset")
 
 
 class SitemapObject:
-    __slots__ = ["base_url", "content", "domain", "sitemap_url", "sitemap_urls", "target_lang", "urls"]
-
-    def __init__(self, base_url: str, domain: str, sitemap_url: str, target_lang: Optional[str] = None) -> None:
+    __slots__ = [
+        "base_url",
+        "content",
+        "domain",
+        "external",
+        "sitemap_url",
+        "sitemap_urls",
+        "target_lang",
+        "urls",
+    ]
+
+    def __init__(
+        self,
+        base_url: str,
+        domain: str,
+        sitemap_url: str,
+        target_lang: Optional[str] = None,
+        external: bool = False,
+    ) -> None:
         self.base_url: str = base_url
         self.content: str = ""
         self.domain: str = domain
+        self.external: bool = external
         self.sitemap_url: str = sitemap_url
         self.sitemap_urls: List[str] = []
         self.target_lang: Optional[str] = target_lang
@@ -56,12 +78,12 @@ def __init__(self, base_url: str, domain: str, sitemap_url: str, target_lang: Op
 
     def fetch(self) -> None:
         "Fetch a sitemap over the network."
-        LOGGER.debug('fetching sitemap: %s', self.sitemap_url)
+        LOGGER.debug("fetching sitemap: %s", self.sitemap_url)
         self.content = fetch_url(self.sitemap_url)
 
     def handle_link(self, link: str) -> None:
         """Examine a link and determine if it's valid and if it leads to
-           a sitemap or a web page."""
+        a sitemap or a web page."""
         if link == self.sitemap_url:  # safety check
             return
         # fix, check, clean and normalize
@@ -78,8 +100,14 @@ def handle_link(self, link: str) -> None:
 
         # don't take links from another domain and make an exception for main platforms
         # also bypass: subdomains vs. domains
-        if not is_similar_domain(self.domain, newdomain) and not WHITELISTED_PLATFORMS.search(newdomain):
-            LOGGER.warning('link discarded, diverging domain names: %s %s', self.domain, newdomain)
+        if (
+            not self.external
+            and not WHITELISTED_PLATFORMS.search(newdomain)
+            and not is_similar_domain(self.domain, newdomain)
+        ):
+            LOGGER.warning(
+                "link discarded, diverging domain names: %s %s", self.domain, newdomain
+            )
             return
 
         if DETECT_SITEMAP_LINK.search(link):
@@ -89,25 +117,41 @@ def handle_link(self, link: str) -> None:
 
     def extract_sitemap_langlinks(self) -> None:
         "Extract links corresponding to a given target language."
-        if 'hreflang=' not in self.content:
+        if "hreflang=" not in self.content:
             return
         # compile regex here for modularity and efficiency
-        lang_regex = re.compile(rf"hreflang=[\"']({self.target_lang}.*?|x-default)[\"']", re.DOTALL)
+        lang_regex = re.compile(
+            rf"hreflang=[\"']({self.target_lang}.*?|x-default)[\"']", re.DOTALL
+        )
         # extract
-        for attrs in (m[0] for m in islice(XHTML_REGEX.finditer(self.content), MAX_LINKS)):
+        for attrs in (
+            m[0] for m in islice(XHTML_REGEX.finditer(self.content), MAX_LINKS)
+        ):
             if lang_regex.search(attrs):
                 lang_match = HREFLANG_REGEX.search(attrs)
                 if lang_match:
                     self.handle_link(lang_match[1])
-        LOGGER.debug('%s sitemaps and %s links with hreflang found for %s', len(self.sitemap_urls), len(self.urls), self.sitemap_url)
+        LOGGER.debug(
+            "%s sitemaps and %s links with hreflang found for %s",
+            len(self.sitemap_urls),
+            len(self.urls),
+            self.sitemap_url,
+        )
 
     def extract_sitemap_links(self) -> None:
         "Extract sitemap links and web page links from a sitemap file."
         # extract
-        for match in (m[1] for m in islice(LINK_REGEX.finditer(self.content), MAX_LINKS)):
+        for match in (
+            m[1] for m in islice(LINK_REGEX.finditer(self.content), MAX_LINKS)
+        ):
             # process middle part of the match tuple
             self.handle_link(match)
-        LOGGER.debug('%s sitemaps and %s links found for %s', len(self.sitemap_urls), len(self.urls), self.sitemap_url)
+        LOGGER.debug(
+            "%s sitemaps and %s links found for %s",
+            len(self.sitemap_urls),
+            len(self.urls),
+            self.sitemap_url,
+        )
 
     def process(self) -> None:
         "Download a sitemap and extract the links it contains."
@@ -117,7 +161,9 @@ def process(self) -> None:
             return
         # try to extract links from TXT file
         if not SITEMAP_FORMAT.match(self.content):
-            for match in (m[0] for m in islice(DETECT_LINKS.finditer(self.content), MAX_LINKS)):
+            for match in (
+                m[0] for m in islice(DETECT_LINKS.finditer(self.content), MAX_LINKS)
+            ):
                 self.handle_link(match)
             return
         # process XML sitemap
@@ -128,14 +174,18 @@ def process(self) -> None:
         self.extract_sitemap_links()
 
 
-def sitemap_search(url: str, target_lang: Optional[str] = None) -> List[str]:
+def sitemap_search(
+    url: str, target_lang: Optional[str] = None, external: bool = False
+) -> List[str]:
     """Look for sitemaps for the given URL and gather links.
 
     Args:
         url: Webpage or sitemap URL as string.
             Triggers URL-based filter if the webpage isn't a homepage.
         target_lang: Define a language to filter URLs based on heuristics
-            (two-letter string, ISO 639-1 format).
+                     (two-letter string, ISO 639-1 format).
+        external: Similar hosts only or external URLs
+                  (boolean, defaults to False).
 
     Returns:
         The extracted links as a list (sorted list of unique links).
@@ -143,29 +193,29 @@ def sitemap_search(url: str, target_lang: Optional[str] = None) -> List[str]:
     """
     domainname, baseurl = get_hostinfo(url)
     if domainname is None:
-        LOGGER.warning('invalid URL: %s', url)
+        LOGGER.warning("invalid URL: %s", url)
         return []
 
     if not is_live_page(baseurl):
-        LOGGER.warning('base URL unreachable, dropping sitemap: %s', url)
+        LOGGER.warning("base URL unreachable, dropping sitemap: %s", url)
         return []
 
     urlfilter = None
-    if url.endswith(('.gz', 'sitemap', '.xml')):
+    if url.endswith((".gz", "sitemap", ".xml")):
         sitemapurl = url
     else:
-        sitemapurl = baseurl + '/sitemap.xml'
+        sitemapurl = baseurl + "/sitemap.xml"
     # filter triggered, prepare it
     if len(url) > len(baseurl) + 2:
         urlfilter = url
 
-    sitemap = SitemapObject(baseurl, domainname, sitemapurl, target_lang)
+    sitemap = SitemapObject(baseurl, domainname, sitemapurl, target_lang, external)
     sitemap.fetch()
     sitemap.process()
 
     if not sitemap.sitemap_urls and sitemap.urls:
         linklist = filter_urls(sitemap.urls, urlfilter)
-        LOGGER.debug('%s sitemap links found for %s', len(linklist), domainname)
+        LOGGER.debug("%s sitemap links found for %s", len(linklist), domainname)
         return linklist
 
     # try sitemaps in robots.txt file if nothing has been found
@@ -173,7 +223,7 @@ def sitemap_search(url: str, target_lang: Optional[str] = None) -> List[str]:
         sitemap.sitemap_urls = find_robots_sitemaps(baseurl)
     # try additional URLs just in case
     if not sitemap.sitemap_urls:
-        sitemap.sitemap_urls = [''.join([baseurl, '/', g]) for g in GUESSES]
+        sitemap.sitemap_urls = ["".join([baseurl, "/", g]) for g in GUESSES]
 
     # iterate through nested sitemaps and results
     seen = {sitemapurl}
@@ -191,38 +241,40 @@ def sitemap_search(url: str, target_lang: Optional[str] = None) -> List[str]:
             break
 
     sitemap.urls = filter_urls(sitemap.urls, urlfilter)
-    LOGGER.debug('%s sitemap links found for %s', len(sitemap.urls), domainname)
+    LOGGER.debug("%s sitemap links found for %s", len(sitemap.urls), domainname)
     return sitemap.urls
 
 
 def is_plausible_sitemap(url: str, contents: Optional[str]) -> bool:
-    '''Check if the sitemap corresponds to an expected format,
-       i.e. TXT or XML.'''
+    """Check if the sitemap corresponds to an expected format,
+       i.e. TXT or XML."""
     if contents is None:
         return False
 
     # strip query and fragments
-    url = SCRUB_REGEX.sub('', url)
+    url = SCRUB_REGEX.sub("", url)
     # check content
-    if POTENTIAL_SITEMAP.search(url) and \
-        (not isinstance(contents, str) or not SITEMAP_FORMAT.match(contents)) \
-        or '<html' in contents[:150].lower():
-        LOGGER.warning('not a valid XML sitemap: %s', url)
+    if (
+        POTENTIAL_SITEMAP.search(url)
+        and (not isinstance(contents, str) or not SITEMAP_FORMAT.match(contents))
+        or "<html" in contents[:150].lower()
+    ):
+        LOGGER.warning("not a valid XML sitemap: %s", url)
         return False
     return True
 
 
 def find_robots_sitemaps(baseurl: str) -> List[str]:
-    '''Guess the location of the robots.txt file and try to extract
-       sitemap URLs from it'''
-    robotstxt = fetch_url(baseurl + '/robots.txt')
+    """Guess the location of the robots.txt file and try to extract
+       sitemap URLs from it"""
+    robotstxt = fetch_url(baseurl + "/robots.txt")
     return extract_robots_sitemaps(robotstxt, baseurl)
 
 
 def extract_robots_sitemaps(robotstxt: str, baseurl: str) -> List[str]:
-    'Read a robots.txt file and find sitemap links.'
+    "Read a robots.txt file and find sitemap links."
     # sanity check on length (cause: redirections)
     if robotstxt is None or len(robotstxt) > 10000:
         return []
@@ -230,18 +282,18 @@
     # source: https://github.com/python/cpython/blob/3.8/Lib/urllib/robotparser.py
     for line in robotstxt.splitlines():
         # remove optional comment and strip line
-        i = line.find('#')
+        i = line.find("#")
         if i >= 0:
             line = line[:i]
         line = line.strip()
         if not line:
             continue
-        line = line.split(':', 1)
+        line = line.split(":", 1)
         if len(line) == 2:
             line[0] = line[0].strip().lower()
             if line[0] == "sitemap":
                 # urllib.parse.unquote(line[1].strip())
                 candidate = fix_relative_urls(baseurl, line[1].strip())
                 sitemapurls.append(candidate)
-    LOGGER.debug('%s sitemaps found in robots.txt', len(sitemapurls))
+    LOGGER.debug("%s sitemaps found in robots.txt", len(sitemapurls))
     return sitemapurls
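
Usage sketch (not part of the patch, added for illustration): the function names and the new external parameter come from the diff above; the homepage URL and variable names are placeholders. On the CLI side, the same behaviour is driven by the EXTERNAL_URLS entry added to settings.cfg and read in cli_discovery().

# Illustrative only: exercises the "external" keyword introduced in this patch.
from trafilatura.feeds import find_feed_urls
from trafilatura.sitemaps import sitemap_search

homepage = "https://www.example.org/"  # placeholder target site

# Default behaviour: links whose domain diverges from the start page are discarded.
same_host_feeds = find_feed_urls(homepage, target_lang="en")

# external=True also keeps links that lead to other hosts.
all_feeds = find_feed_urls(homepage, target_lang="en", external=True)
all_sitemap_links = sitemap_search(homepage, external=True)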