From 439899dd8b36869ecb2d70be0356c1860ff9570b Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi
Date: Thu, 9 Nov 2023 13:15:07 +0100
Subject: [PATCH] format + type hinting
---
tests/feeds_tests.py | 242 ++++++++++++++++++++++++++++++-------------
trafilatura/feeds.py | 177 ++++++++++++++++++++-----------
2 files changed, 286 insertions(+), 133 deletions(-)
diff --git a/tests/feeds_tests.py b/tests/feeds_tests.py
index f9466465..5b32e888 100644
--- a/tests/feeds_tests.py
+++ b/tests/feeds_tests.py
@@ -13,183 +13,283 @@
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
TEST_DIR = os.path.abspath(os.path.dirname(__file__))
-RESOURCES_DIR = os.path.join(TEST_DIR, 'resources')
+RESOURCES_DIR = os.path.join(TEST_DIR, "resources")
XMLDECL = '\n'
def test_atom_extraction():
- '''Test link extraction from an Atom feed'''
- params = feeds.FeedParameters('https://example.org', 'example.org', '')
+ """Test link extraction from an Atom feed"""
+ params = feeds.FeedParameters("https://example.org", "example.org", "")
assert not feeds.extract_links(None, params)
- assert len(feeds.extract_links('', params)) == 0
+ assert len(feeds.extract_links("", params)) == 0
- filepath = os.path.join(RESOURCES_DIR, 'feed1.atom')
+ filepath = os.path.join(RESOURCES_DIR, "feed1.atom")
with open(filepath, "r", encoding="utf-8") as f:
teststring = f.read()
assert len(feeds.extract_links(teststring, params)) > 0
- params = feeds.FeedParameters('https://www.dwds.de', 'dwds.de', '')
+ params = feeds.FeedParameters("https://www.dwds.de", "dwds.de", "")
assert (
len(
feeds.extract_links(
f'{XMLDECL}',
- params
+ params,
)
)
== 0
)
- params = feeds.FeedParameters('http://example.org', 'example.org', 'http://example.org')
+ params = feeds.FeedParameters(
+ "http://example.org", "example.org", "http://example.org"
+ )
assert (
len(
feeds.extract_links(
f'{XMLDECL}',
- params
+ params,
)
)
== 0
)
- params = feeds.FeedParameters('https://example.org', 'example.org', '')
+ params = feeds.FeedParameters("https://example.org", "example.org", "")
assert (
len(
feeds.extract_links(
f'{XMLDECL}',
- params
+ params,
)
)
== 0
)
- params = feeds.FeedParameters('http://example.org/', 'example.org', 'http://example.org')
+ params = feeds.FeedParameters(
+ "http://example.org/", "example.org", "http://example.org"
+ )
assert feeds.extract_links(
- f'{XMLDECL}',
- params) == ['http://example.org/article1/'] # TODO: remove slash?
+ f'{XMLDECL}', params
+ ) == [
+ "http://example.org/article1/"
+ ] # TODO: remove slash?
def test_rss_extraction():
- '''Test link extraction from a RSS feed'''
- params = feeds.FeedParameters('http://example.org/', 'example.org', '')
+ """Test link extraction from a RSS feed"""
+ params = feeds.FeedParameters("http://example.org/", "example.org", "")
assert (
len(
feeds.extract_links(
- f'{XMLDECL}http://example.org/article1/',
- params
+ f"{XMLDECL}http://example.org/article1/", params
)
)
== 1
)
# CDATA
assert feeds.extract_links(
- f'{XMLDECL}',
- params) == ['http://example.org/article1/'] # TODO: remove slash?
+ f"{XMLDECL}", params
+ ) == [
+ "http://example.org/article1/"
+ ] # TODO: remove slash?
# spaces
- params = feeds.FeedParameters('https://www.ak-kurier.de/', 'ak-kurier.de', '')
- assert len(feeds.extract_links(XMLDECL + '\r\n https://www.ak-kurier.de/akkurier/www/artikel/108815-sinfonisches-blasorchester-spielt-1500-euro-fuer-kinder-in-drk-krankenhaus-kirchen-ein ', params)) == 1
-
- params = feeds.FeedParameters('http://example.org', 'example.org', 'http://example.org')
+ params = feeds.FeedParameters("https://www.ak-kurier.de/", "ak-kurier.de", "")
assert (
len(
feeds.extract_links(
- f'{XMLDECL}http://example.org/',
- params
+ XMLDECL
+ + "\r\n https://www.ak-kurier.de/akkurier/www/artikel/108815-sinfonisches-blasorchester-spielt-1500-euro-fuer-kinder-in-drk-krankenhaus-kirchen-ein ",
+ params,
)
)
+ == 1
+ )
+
+ params = feeds.FeedParameters(
+ "http://example.org", "example.org", "http://example.org"
+ )
+ assert (
+ len(feeds.extract_links(f"{XMLDECL}http://example.org/", params))
== 0
)
- params = feeds.FeedParameters('http://example.org', 'example.org', '')
+ params = feeds.FeedParameters("http://example.org", "example.org", "")
assert (
- len(
- feeds.extract_links(
- f'{XMLDECL}https://example.org',
- params
- )
- )
+ len(feeds.extract_links(f"{XMLDECL}https://example.org", params))
== 0
)
- params = feeds.FeedParameters('https://www.dwds.de', 'dwds.de', 'https://www.dwds.de')
+ params = feeds.FeedParameters(
+ "https://www.dwds.de", "dwds.de", "https://www.dwds.de"
+ )
assert feeds.extract_links(
- f'{XMLDECL}/api/feed/themenglossar/Corona',
- params
- ) == ['https://www.dwds.de/api/feed/themenglossar/Corona']
+ f"{XMLDECL}/api/feed/themenglossar/Corona", params
+ ) == ["https://www.dwds.de/api/feed/themenglossar/Corona"]
- params = feeds.FeedParameters('https://example.org', 'example.org', '')
- filepath = os.path.join(RESOURCES_DIR, 'feed2.rss')
+ params = feeds.FeedParameters("https://example.org", "example.org", "")
+ filepath = os.path.join(RESOURCES_DIR, "feed2.rss")
with open(filepath, "r", encoding="utf-8") as f:
teststring = f.read()
assert len(feeds.extract_links(teststring, params)) > 0
def test_json_extraction():
- '''Test link extraction from a JSON feed'''
+ """Test link extraction from a JSON feed"""
# find link
- params = feeds.FeedParameters('https://www.jsonfeed.org', 'jsonfeed.org', '')
- assert len(feeds.determine_feed('>', params)) == 1
+ params = feeds.FeedParameters("https://www.jsonfeed.org", "jsonfeed.org", "")
+ assert (
+ len(
+ feeds.determine_feed(
+ '>