diff --git a/trafilatura/cli_utils.py b/trafilatura/cli_utils.py
index 0db1f3d8..bb4601d0 100644
--- a/trafilatura/cli_utils.py
+++ b/trafilatura/cli_utils.py
@@ -236,10 +236,11 @@ def cli_discovery(args):
input_urls = url_store.dump_urls()
if args.list:
url_store.reset()
+ ext = use_config(filename=args.config_file).getboolean('DEFAULT', 'EXTERNAL_URLS')
# link discovery and storage
with ThreadPoolExecutor(max_workers=args.parallel) as executor:
- futures = (executor.submit(func, url, target_lang=args.target_language) for url in input_urls)
+ futures = (executor.submit(func, url, target_lang=args.target_language, external=ext) for url in input_urls)
# process results from the parallel threads and add them
# to the compressed URL dictionary for further processing
for future in as_completed(futures):
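
For illustration, a minimal standalone sketch of the pattern `cli_discovery` now follows: look up the new EXTERNAL_URLS flag once, then forward it to every discovery call running in the thread pool. The URL list, worker count and target language are placeholders, and the bare `use_config()` call assumes it falls back to the packaged settings.cfg shown further down.

```python
# Sketch only: mirrors the CLI change above, outside of cli_utils.py.
from concurrent.futures import ThreadPoolExecutor, as_completed

from trafilatura.feeds import find_feed_urls
from trafilatura.settings import use_config

input_urls = ["https://www.example.org/"]  # placeholder homepage list

# assumption: calling use_config() without a filename returns the packaged
# settings.cfg, where EXTERNAL_URLS defaults to off
ext = use_config().getboolean("DEFAULT", "EXTERNAL_URLS")

with ThreadPoolExecutor(max_workers=4) as executor:
    futures = (
        executor.submit(find_feed_urls, url, target_lang="en", external=ext)
        for url in input_urls
    )
    for future in as_completed(futures):
        print(future.result())
```
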
diff --git a/trafilatura/feeds.py b/trafilatura/feeds.py
index 21d2b74e..4172ed3b 100644
--- a/trafilatura/feeds.py
+++ b/trafilatura/feeds.py
@@ -27,7 +27,7 @@
BLACKLIST = re.compile(r'\bcomments\b') # no comment feed
-def handle_link_list(linklist, domainname, baseurl, target_lang=None):
+def handle_link_list(linklist, domainname, baseurl, target_lang=None, external=False):
'''Examine links to determine if they are valid and
lead to a web page'''
output_links = []
@@ -38,7 +38,7 @@ def handle_link_list(linklist, domainname, baseurl, target_lang=None):
# control output for validity
checked = check_url(link, language=target_lang)
if checked is not None:
- if not is_similar_domain(domainname, checked[1]) and not "feed" in link:
+ if not external and not "feed" in link and not is_similar_domain(domainname, checked[1]):
LOGGER.warning('Rejected, diverging domain names: %s %s', domainname, checked[1])
else:
output_links.append(checked[0])
@@ -48,7 +48,7 @@ def handle_link_list(linklist, domainname, baseurl, target_lang=None):
return output_links
-def extract_links(feed_string, domainname, baseurl, reference, target_lang=None):
+def extract_links(feed_string, domainname, baseurl, reference, target_lang=None, external=False):
'''Extract links from Atom and RSS feeds'''
feed_links = []
# check if it's a feed
@@ -91,7 +91,7 @@ def extract_links(feed_string, domainname, baseurl, reference, target_lang=None)
)
# refine
- output_links = handle_link_list(feed_links, domainname, baseurl, target_lang)
+ output_links = handle_link_list(feed_links, domainname, baseurl, target_lang, external)
output_links = [l for l in output_links if l != reference and l.count('/') > 2]
# log result
if feed_links:
@@ -145,14 +145,16 @@ def determine_feed(htmlstring, baseurl, reference):
return output_urls
-def find_feed_urls(url, target_lang=None):
+def find_feed_urls(url, target_lang=None, external=False):
"""Try to find feed URLs.
Args:
url: Webpage or feed URL as string.
Triggers URL-based filter if the webpage isn't a homepage.
target_lang: Define a language to filter URLs based on heuristics
- (two-letter string, ISO 639-1 format).
+ (two-letter string, ISO 639-1 format).
+ external: Similar hosts only or external URLs
+ (boolean, defaults to False).
Returns:
The extracted links as a list (sorted list of unique links).
@@ -166,12 +168,12 @@ def find_feed_urls(url, target_lang=None):
downloaded = fetch_url(url)
if downloaded is not None:
# assume it's a feed
- feed_links = extract_links(downloaded, domainname, baseurl, url, target_lang)
+ feed_links = extract_links(downloaded, domainname, baseurl, url, target_lang, external)
if len(feed_links) == 0:
# assume it's a web page
for feed in determine_feed(downloaded, baseurl, url):
feed_string = fetch_url(feed)
- feed_links.extend(extract_links(feed_string, domainname, baseurl, url, target_lang))
+ feed_links.extend(extract_links(feed_string, domainname, baseurl, url, target_lang, external))
# filter triggered, prepare it
if len(url) > len(baseurl) + 2:
urlfilter = url
@@ -191,7 +193,7 @@ def find_feed_urls(url, target_lang=None):
f'https://news.google.com/rss/search?q=site:{baseurl}&hl={target_lang}&scoring=n&num=100'
)
if downloaded is not None:
- feed_links = extract_links(downloaded, domainname, baseurl, url, target_lang)
+ feed_links = extract_links(downloaded, domainname, baseurl, url, target_lang, external)
feed_links = filter_urls(feed_links, urlfilter)
LOGGER.debug('%s Google news links found for %s', len(feed_links), domainname)
return feed_links
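
A hedged usage sketch for the feed side of the change: passing the new `external` keyword keeps feed entries hosted on other domains, which `handle_link_list` would otherwise reject. The URL is a placeholder.

```python
from trafilatura.feeds import find_feed_urls

# external=True: links on diverging domains are no longer filtered out
links = find_feed_urls("https://www.example.org/", target_lang="en", external=True)

# default behaviour is unchanged: similar-host links only
same_host_links = find_feed_urls("https://www.example.org/")
```
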
diff --git a/trafilatura/settings.cfg b/trafilatura/settings.cfg
index 457d4ff4..729f28c9 100644
--- a/trafilatura/settings.cfg
+++ b/trafilatura/settings.cfg
@@ -28,3 +28,6 @@ MAX_REPETITIONS = 2
# Extraction option for Htmldate
EXTENSIVE_DATE_SEARCH = on
+
+# URLs in feeds and sitemaps
+EXTERNAL_URLS = off
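
To flip the new setting without touching the packaged file, a user-supplied configuration can be loaded through `use_config`, the same way `cli_discovery` reads it above. This is a sketch under the assumption that a partial file suffices for this one lookup; in practice you would copy the full settings.cfg and change the single line.

```python
from trafilatura.settings import use_config

# hypothetical file name; only the key needed for this lookup is written
with open("my_settings.cfg", "w", encoding="utf-8") as f:
    f.write("[DEFAULT]\nEXTERNAL_URLS = on\n")

config = use_config(filename="my_settings.cfg")
print(config.getboolean("DEFAULT", "EXTERNAL_URLS"))  # True
```
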
diff --git a/trafilatura/sitemaps.py b/trafilatura/sitemaps.py
index b17178a1..1a974e91 100644
--- a/trafilatura/sitemaps.py
+++ b/trafilatura/sitemaps.py
@@ -11,8 +11,14 @@
from itertools import islice
from typing import List, Optional
-from courlan import (clean_url, extract_domain, filter_urls, fix_relative_urls,
- get_hostinfo, lang_filter)
+from courlan import (
+ clean_url,
+ extract_domain,
+ filter_urls,
+ fix_relative_urls,
+ get_hostinfo,
+ lang_filter,
+)
from .downloads import fetch_url, is_live_page
from .settings import MAX_LINKS, MAX_SITEMAPS_SEEN
@@ -22,33 +28,49 @@
# ROBOT_PARSER = urllib.robotparser.RobotFileParser()
-
-
-
LOGGER = logging.getLogger(__name__)
-LINK_REGEX = re.compile(r'<loc>(?:<!\[CDATA\[)?(http.+?)(?:\]\]>)?</loc>')
-XHTML_REGEX = re.compile(r'<xhtml:link.+?/>', re.DOTALL)
+LINK_REGEX = re.compile(r"<loc>(?:<!\[CDATA\[)?(http.+?)(?:\]\]>)?</loc>")
+XHTML_REGEX = re.compile(r"<xhtml:link.+?/>", re.DOTALL)
HREFLANG_REGEX = re.compile(r'href=["\'](.+?)["\']')
-WHITELISTED_PLATFORMS = re.compile(r'(?:blogger|blogpost|ghost|hubspot|livejournal|medium|typepad|squarespace|tumblr|weebly|wix|wordpress)\.')
+WHITELISTED_PLATFORMS = re.compile(
+ r"(?:blogger|blogpost|ghost|hubspot|livejournal|medium|typepad|squarespace|tumblr|weebly|wix|wordpress)\."
+)
-SITEMAP_FORMAT = re.compile(r'^.{0,5}<\?xml|<sitemap|<urlset')
+SITEMAP_FORMAT = re.compile(r"^.{0,5}<\?xml|<sitemap|<urlset")
class SitemapObject:
-    __slots__ = ["base_url", "content", "domain", "sitemap_url", "sitemap_urls", "target_lang", "urls"]
-    def __init__(self, base_url: str, domain: str, sitemap_url: str, target_lang: Optional[str] = None) -> None:
+ __slots__ = [
+ "base_url",
+ "content",
+ "domain",
+ "external",
+ "sitemap_url",
+ "sitemap_urls",
+ "target_lang",
+ "urls",
+ ]
+
+ def __init__(
+ self,
+ base_url: str,
+ domain: str,
+ sitemap_url: str,
+ target_lang: Optional[str] = None,
+ external: bool = False,
+ ) -> None:
self.base_url: str = base_url
self.content: str = ""
self.domain: str = domain
+ self.external: bool = external
self.sitemap_url: str = sitemap_url
self.sitemap_urls: List[str] = []
self.target_lang: Optional[str] = target_lang
@@ -56,12 +78,12 @@ def __init__(self, base_url: str, domain: str, sitemap_url: str, target_lang: Op
def fetch(self) -> None:
"Fetch a sitemap over the network."
- LOGGER.debug('fetching sitemap: %s', self.sitemap_url)
+ LOGGER.debug("fetching sitemap: %s", self.sitemap_url)
self.content = fetch_url(self.sitemap_url)
def handle_link(self, link: str) -> None:
"""Examine a link and determine if it's valid and if it leads to
- a sitemap or a web page."""
+ a sitemap or a web page."""
if link == self.sitemap_url: # safety check
return
# fix, check, clean and normalize
@@ -78,8 +100,14 @@ def handle_link(self, link: str) -> None:
# don't take links from another domain and make an exception for main platforms
# also bypass: subdomains vs. domains
- if not is_similar_domain(self.domain, newdomain) and not WHITELISTED_PLATFORMS.search(newdomain):
- LOGGER.warning('link discarded, diverging domain names: %s %s', self.domain, newdomain)
+ if (
+ not self.external
+ and not WHITELISTED_PLATFORMS.search(newdomain)
+ and not is_similar_domain(self.domain, newdomain)
+ ):
+ LOGGER.warning(
+ "link discarded, diverging domain names: %s %s", self.domain, newdomain
+ )
return
if DETECT_SITEMAP_LINK.search(link):
@@ -89,25 +117,41 @@ def handle_link(self, link: str) -> None:
def extract_sitemap_langlinks(self) -> None:
"Extract links corresponding to a given target language."
- if 'hreflang=' not in self.content:
+ if "hreflang=" not in self.content:
return
# compile regex here for modularity and efficiency
- lang_regex = re.compile(rf"hreflang=[\"']({self.target_lang}.*?|x-default)[\"']", re.DOTALL)
+ lang_regex = re.compile(
+ rf"hreflang=[\"']({self.target_lang}.*?|x-default)[\"']", re.DOTALL
+ )
# extract
- for attrs in (m[0] for m in islice(XHTML_REGEX.finditer(self.content), MAX_LINKS)):
+ for attrs in (
+ m[0] for m in islice(XHTML_REGEX.finditer(self.content), MAX_LINKS)
+ ):
if lang_regex.search(attrs):
lang_match = HREFLANG_REGEX.search(attrs)
if lang_match:
self.handle_link(lang_match[1])
- LOGGER.debug('%s sitemaps and %s links with hreflang found for %s', len(self.sitemap_urls), len(self.urls), self.sitemap_url)
+ LOGGER.debug(
+ "%s sitemaps and %s links with hreflang found for %s",
+ len(self.sitemap_urls),
+ len(self.urls),
+ self.sitemap_url,
+ )
def extract_sitemap_links(self) -> None:
"Extract sitemap links and web page links from a sitemap file."
# extract
- for match in (m[1] for m in islice(LINK_REGEX.finditer(self.content), MAX_LINKS)):
+ for match in (
+ m[1] for m in islice(LINK_REGEX.finditer(self.content), MAX_LINKS)
+ ):
# process middle part of the match tuple
self.handle_link(match)
- LOGGER.debug('%s sitemaps and %s links found for %s', len(self.sitemap_urls), len(self.urls), self.sitemap_url)
+ LOGGER.debug(
+ "%s sitemaps and %s links found for %s",
+ len(self.sitemap_urls),
+ len(self.urls),
+ self.sitemap_url,
+ )
def process(self) -> None:
"Download a sitemap and extract the links it contains."
@@ -117,7 +161,9 @@ def process(self) -> None:
return
# try to extract links from TXT file
if not SITEMAP_FORMAT.match(self.content):
- for match in (m[0] for m in islice(DETECT_LINKS.finditer(self.content), MAX_LINKS)):
+ for match in (
+ m[0] for m in islice(DETECT_LINKS.finditer(self.content), MAX_LINKS)
+ ):
self.handle_link(match)
return
# process XML sitemap
@@ -128,14 +174,18 @@ def process(self) -> None:
self.extract_sitemap_links()
-def sitemap_search(url: str, target_lang: Optional[str] = None) -> List[str]:
+def sitemap_search(
+ url: str, target_lang: Optional[str] = None, external: bool = False
+) -> List[str]:
"""Look for sitemaps for the given URL and gather links.
Args:
url: Webpage or sitemap URL as string.
Triggers URL-based filter if the webpage isn't a homepage.
target_lang: Define a language to filter URLs based on heuristics
- (two-letter string, ISO 639-1 format).
+ (two-letter string, ISO 639-1 format).
+ external: Similar hosts only or external URLs
+ (boolean, defaults to False).
Returns:
The extracted links as a list (sorted list of unique links).
@@ -143,29 +193,29 @@ def sitemap_search(url: str, target_lang: Optional[str] = None) -> List[str]:
"""
domainname, baseurl = get_hostinfo(url)
if domainname is None:
- LOGGER.warning('invalid URL: %s', url)
+ LOGGER.warning("invalid URL: %s", url)
return []
if not is_live_page(baseurl):
- LOGGER.warning('base URL unreachable, dropping sitemap: %s', url)
+ LOGGER.warning("base URL unreachable, dropping sitemap: %s", url)
return []
urlfilter = None
- if url.endswith(('.gz', 'sitemap', '.xml')):
+ if url.endswith((".gz", "sitemap", ".xml")):
sitemapurl = url
else:
- sitemapurl = baseurl + '/sitemap.xml'
+ sitemapurl = baseurl + "/sitemap.xml"
# filter triggered, prepare it
if len(url) > len(baseurl) + 2:
urlfilter = url
- sitemap = SitemapObject(baseurl, domainname, sitemapurl, target_lang)
+ sitemap = SitemapObject(baseurl, domainname, sitemapurl, target_lang, external)
sitemap.fetch()
sitemap.process()
if not sitemap.sitemap_urls and sitemap.urls:
linklist = filter_urls(sitemap.urls, urlfilter)
- LOGGER.debug('%s sitemap links found for %s', len(linklist), domainname)
+ LOGGER.debug("%s sitemap links found for %s", len(linklist), domainname)
return linklist
# try sitemaps in robots.txt file if nothing has been found
@@ -173,7 +223,7 @@ def sitemap_search(url: str, target_lang: Optional[str] = None) -> List[str]:
sitemap.sitemap_urls = find_robots_sitemaps(baseurl)
# try additional URLs just in case
if not sitemap.sitemap_urls:
- sitemap.sitemap_urls = [''.join([baseurl, '/', g]) for g in GUESSES]
+ sitemap.sitemap_urls = ["".join([baseurl, "/", g]) for g in GUESSES]
# iterate through nested sitemaps and results
seen = {sitemapurl}
@@ -191,38 +241,40 @@ def sitemap_search(url: str, target_lang: Optional[str] = None) -> List[str]:
break
sitemap.urls = filter_urls(sitemap.urls, urlfilter)
- LOGGER.debug('%s sitemap links found for %s', len(sitemap.urls), domainname)
+ LOGGER.debug("%s sitemap links found for %s", len(sitemap.urls), domainname)
return sitemap.urls
def is_plausible_sitemap(url: str, contents: Optional[str]) -> bool:
- '''Check if the sitemap corresponds to an expected format,
- i.e. TXT or XML.'''
+ """Check if the sitemap corresponds to an expected format,
+ i.e. TXT or XML."""
if contents is None:
return False
# strip query and fragments
- url = SCRUB_REGEX.sub('', url)
+ url = SCRUB_REGEX.sub("", url)
# check content
- if POTENTIAL_SITEMAP.search(url) and \
- (not isinstance(contents, str) or not SITEMAP_FORMAT.match(contents)) \
-       or '<html' in contents[:150].lower():
-        LOGGER.warning('not a valid XML sitemap: %s', url)
+    if (
+        POTENTIAL_SITEMAP.search(url)
+        and (not isinstance(contents, str) or not SITEMAP_FORMAT.match(contents))
+        or "<html" in contents[:150].lower()
+    ):
+        LOGGER.warning("not a valid XML sitemap: %s", url)
return False
return True
def find_robots_sitemaps(baseurl: str) -> List[str]:
- '''Guess the location of the robots.txt file and try to extract
- sitemap URLs from it'''
- robotstxt = fetch_url(baseurl + '/robots.txt')
+ """Guess the location of the robots.txt file and try to extract
+ sitemap URLs from it"""
+ robotstxt = fetch_url(baseurl + "/robots.txt")
return extract_robots_sitemaps(robotstxt, baseurl)
def extract_robots_sitemaps(robotstxt: str, baseurl: str) -> List[str]:
- 'Read a robots.txt file and find sitemap links.'
+ "Read a robots.txt file and find sitemap links."
# sanity check on length (cause: redirections)
if robotstxt is None or len(robotstxt) > 10000:
return []
@@ -230,18 +282,18 @@ def extract_robots_sitemaps(robotstxt: str, baseurl: str) -> List[str]:
# source: https://github.com/python/cpython/blob/3.8/Lib/urllib/robotparser.py
for line in robotstxt.splitlines():
# remove optional comment and strip line
- i = line.find('#')
+ i = line.find("#")
if i >= 0:
line = line[:i]
line = line.strip()
if not line:
continue
- line = line.split(':', 1)
+ line = line.split(":", 1)
if len(line) == 2:
line[0] = line[0].strip().lower()
if line[0] == "sitemap":
# urllib.parse.unquote(line[1].strip())
candidate = fix_relative_urls(baseurl, line[1].strip())
sitemapurls.append(candidate)
- LOGGER.debug('%s sitemaps found in robots.txt', len(sitemapurls))
+ LOGGER.debug("%s sitemaps found in robots.txt", len(sitemapurls))
return sitemapurls
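
Finally, a hedged sketch for the sitemap side: `external=True` is forwarded to `SitemapObject`, so `handle_link` keeps links on diverging domains instead of discarding them (the WHITELISTED_PLATFORMS exception only matters while the flag stays off). The URL is a placeholder.

```python
from trafilatura.sitemaps import sitemap_search

links = sitemap_search("https://www.example.org/", target_lang="en", external=True)
print(len(links), "sitemap links found")
```
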