From 1e5672c9981f442f13890ef71963575ad464dfeb Mon Sep 17 00:00:00 2001 From: "Aliaksei Yaletski (Tiendil)" Date: Fri, 27 Dec 2024 17:42:43 +0100 Subject: [PATCH 1/3] boilerplate --- ffun/ffun/domain/urls.py | 8 ++-- ffun/ffun/feeds_discoverer/domain.py | 40 ++++++++++++++++++ .../feeds_discoverer/tests/test_domain.py | 41 +++++++++++++++++++ 3 files changed, 85 insertions(+), 4 deletions(-) diff --git a/ffun/ffun/domain/urls.py b/ffun/ffun/domain/urls.py index ecd354f9..e04bb1d8 100644 --- a/ffun/ffun/domain/urls.py +++ b/ffun/ffun/domain/urls.py @@ -37,7 +37,7 @@ def check_furl_error() -> Iterator[None]: raise -def _construct_f_url(url: UnknownUrl | AbsoluteUrl | str) -> furl | None: +def construct_f_url(url: UnknownUrl | AbsoluteUrl | str) -> furl | None: try: with check_furl_error(): return furl(url) @@ -63,7 +63,7 @@ def _fix_classic_url_to_absolute(url: str) -> AbsoluteUrl | None: if tldextract.extract(domain_part).suffix == "": return None - f_url = _construct_f_url(f"//{url}") + f_url = construct_f_url(f"//{url}") if f_url is None: return None @@ -76,7 +76,7 @@ def normalize_classic_unknown_url(url: UnknownUrl) -> AbsoluteUrl | None: url = UnknownUrl(url.strip()) # check if url is parsable - f_url = _construct_f_url(url) + f_url = construct_f_url(url) if f_url is None: return None @@ -145,7 +145,7 @@ def adjust_classic_full_url(url: UnknownUrl, original_url: AbsoluteUrl | FeedUrl # ATTENTION: see note at the top of the file def adjust_classic_relative_url(url: UnknownUrl, original_url: AbsoluteUrl | FeedUrl) -> AbsoluteUrl | None: - f_url = _construct_f_url(original_url) + f_url = construct_f_url(original_url) if f_url is None: return None diff --git a/ffun/ffun/feeds_discoverer/domain.py b/ffun/ffun/feeds_discoverer/domain.py index a09ee700..720907e9 100644 --- a/ffun/ffun/feeds_discoverer/domain.py +++ b/ffun/ffun/feeds_discoverer/domain.py @@ -1,6 +1,7 @@ import asyncio from bs4 import BeautifulSoup +import re from ffun.core import logging from 
ffun.domain.entities import AbsoluteUrl, UnknownUrl @@ -11,6 +12,7 @@ normalize_classic_unknown_url, to_feed_url, url_has_extension, + construct_f_url, ) from ffun.feeds_discoverer.entities import Context, Discoverer, Result, Status from ffun.loader import domain as lo_domain @@ -181,6 +183,42 @@ async def _discover_stop_recursion(context: Context) -> tuple[Context, Result | return context, None +_RE_REDDIT_PATH_PREFIX = re.compile(r"^/r/[^/]+/?") + + +async def _discover_extract_feeds_for_reddit(context: Context) -> tuple[Context, Result | None]: + """New Reddit site has no links to RSS feeds => we construct them.""" + assert context.url is not None + + f_url = construct_f_url(context.url) + + assert f_url is not None + + if f_url.host not in ("www.reddit.com", "reddit.com", "old.reddit.com"): + # We are not interested in non-Reddit domains + return context, None + + if f_url.host == "old.reddit.com": + # Old Reddit site has marked RSS urls in the header + return context, None + + match = _RE_REDDIT_PATH_PREFIX.match(str(f_url.path)) + + if match is None: + return context, None + + base_path = match.group() + + if not base_path.endswith("/"): + base_path += "/" + + f_url.path = f"{base_path}.rss" + f_url.query = None + + return context.replace(candidate_urls={str(f_url)}), None + + + # Note: we do not add internal feed discoverer here (like db check: url -> uid -> feed_id), because # - we do not expect significant performance improvement # - internal feed data (news list) may be slightly outdated (not containing the latest news) @@ -189,6 +227,8 @@ async def _discover_stop_recursion(context: Context) -> tuple[Context, Result | _discover_load_url, _discover_extract_feed_info, _discover_stop_recursion, + _discover_extract_feeds_for_reddit, + _discover_check_candidate_links, _discover_create_soup, _discover_extract_feeds_from_links, _discover_check_candidate_links, diff --git a/ffun/ffun/feeds_discoverer/tests/test_domain.py 
b/ffun/ffun/feeds_discoverer/tests/test_domain.py index 53e9d19a..43910df3 100644 --- a/ffun/ffun/feeds_discoverer/tests/test_domain.py +++ b/ffun/ffun/feeds_discoverer/tests/test_domain.py @@ -15,6 +15,7 @@ _discover_load_url, _discover_stop_recursion, _discoverers, + _discover_extract_feeds_for_reddit, discover, ) from ffun.feeds_discoverer.entities import Context, Result, Status @@ -428,12 +429,52 @@ async def test_depth_not_zero(self) -> None: assert result is None +class TestDiscoverExtractFeedsForReddit: + + @pytest.mark.asyncio + async def test_not_reddit(self) -> None: + context = Context(raw_url=UnknownUrl("http://example.com/test"), + url=str_to_feed_url("http://example.com/test")) + + new_context, result = await _discover_extract_feeds_for_reddit(context) + + assert new_context == context + assert result is None + + @pytest.mark.asyncio + async def test_old_reditt(self) -> None: + context = Context(raw_url=UnknownUrl("https://old.reddit.com/r/feedsfun/"), + url=str_to_feed_url("https://old.reddit.com/r/feedsfun/")) + + new_context, result = await _discover_extract_feeds_for_reddit(context) + + assert new_context == context + assert result is None + + @pytest.mark.parametrize("url,expected_url", [("https://www.reddit.com/r/feedsfun/", "https://www.reddit.com/r/feedsfun/.rss"), + ("https://www.reddit.com/r/feedsfun/?sd=x", "https://www.reddit.com/r/feedsfun/.rss"), + ("https://www.reddit.com/r/feedsfun", "https://www.reddit.com/r/feedsfun/.rss"), + ("https://reddit.com/r/feedsfun/", "https://reddit.com/r/feedsfun/.rss"), + ("https://reddit.com/r/feedsfun", "https://reddit.com/r/feedsfun/.rss"),]) + @pytest.mark.asyncio + async def test_new_reddit(self, url: str, expected_url: FeedUrl) -> None: + context = Context(raw_url=UnknownUrl(url), + url=str_to_feed_url(url)) + + new_context, result = await _discover_extract_feeds_for_reddit(context) + + assert new_context == context.replace(candidate_urls={expected_url}) + assert result is None + + def 
test_discoverers_list_not_changed() -> None: assert _discoverers == [ _discover_adjust_url, _discover_load_url, _discover_extract_feed_info, _discover_stop_recursion, + _discover_extract_feeds_for_reddit, + _discover_check_candidate_links, _discover_create_soup, _discover_extract_feeds_from_links, _discover_check_candidate_links, From 856506643324520cec798e13d729f72eea8d5742 Mon Sep 17 00:00:00 2001 From: "Aliaksei Yaletski (Tiendil)" Date: Fri, 27 Dec 2024 17:46:39 +0100 Subject: [PATCH 2/3] changes --- changes/unreleased.md | 1 + 1 file changed, 1 insertion(+) diff --git a/changes/unreleased.md b/changes/unreleased.md index 258bad12..21321f1f 100644 --- a/changes/unreleased.md +++ b/changes/unreleased.md @@ -15,3 +15,4 @@ Changes: - Only a single news item now can be opened. - Performance of the News page improved 2-3 times. - ff-172 — Added feeds collection "Entrepreneurship & Startups". Also added cli commands `ffun estimates entries-per-day-for-collection` and `ffun estimates entries-per-day-for-feed` +- ff-199 — Fixed Reddit feeds discovery. 
From bb30b740046a0a165a4361af0d21f0ae2ec9c3c7 Mon Sep 17 00:00:00 2001 From: "Aliaksei Yaletski (Tiendil)" Date: Fri, 27 Dec 2024 17:47:58 +0100 Subject: [PATCH 3/3] formatting --- ffun/ffun/feeds_discoverer/domain.py | 5 ++- .../feeds_discoverer/tests/test_domain.py | 31 ++++++++++++------- 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/ffun/ffun/feeds_discoverer/domain.py b/ffun/ffun/feeds_discoverer/domain.py index 720907e9..59cd1813 100644 --- a/ffun/ffun/feeds_discoverer/domain.py +++ b/ffun/ffun/feeds_discoverer/domain.py @@ -1,18 +1,18 @@ import asyncio +import re from bs4 import BeautifulSoup -import re from ffun.core import logging from ffun.domain.entities import AbsoluteUrl, UnknownUrl from ffun.domain.urls import ( adjust_classic_url, + construct_f_url, filter_out_duplicated_urls, get_parent_url, normalize_classic_unknown_url, to_feed_url, url_has_extension, - construct_f_url, ) from ffun.feeds_discoverer.entities import Context, Discoverer, Result, Status from ffun.loader import domain as lo_domain @@ -218,7 +218,6 @@ async def _discover_extract_feeds_for_reddit(context: Context) -> tuple[Context, return context.replace(candidate_urls={str(f_url)}), None - # Note: we do not add internal feed discoverer here (like db check: url -> uid -> feed_id), because # - we do not expect significant performance improvement # - internal feed data (news list) may be slightly outdated (not containing the latest news) diff --git a/ffun/ffun/feeds_discoverer/tests/test_domain.py b/ffun/ffun/feeds_discoverer/tests/test_domain.py index 43910df3..2ff2f0a6 100644 --- a/ffun/ffun/feeds_discoverer/tests/test_domain.py +++ b/ffun/ffun/feeds_discoverer/tests/test_domain.py @@ -10,12 +10,12 @@ _discover_check_parent_urls, _discover_create_soup, _discover_extract_feed_info, + _discover_extract_feeds_for_reddit, _discover_extract_feeds_from_anchors, _discover_extract_feeds_from_links, _discover_load_url, _discover_stop_recursion, _discoverers, - 
_discover_extract_feeds_for_reddit, discover, ) from ffun.feeds_discoverer.entities import Context, Result, Status @@ -433,8 +433,9 @@ class TestDiscoverExtractFeedsForReddit: @pytest.mark.asyncio async def test_not_reddit(self) -> None: - context = Context(raw_url=UnknownUrl("http://example.com/test"), - url=str_to_feed_url("http://example.com/test")) + context = Context( + raw_url=UnknownUrl("http://example.com/test"), url=str_to_feed_url("http://example.com/test") + ) new_context, result = await _discover_extract_feeds_for_reddit(context) @@ -443,23 +444,29 @@ async def test_not_reddit(self) -> None: @pytest.mark.asyncio async def test_old_reditt(self) -> None: - context = Context(raw_url=UnknownUrl("https://old.reddit.com/r/feedsfun/"), - url=str_to_feed_url("https://old.reddit.com/r/feedsfun/")) + context = Context( + raw_url=UnknownUrl("https://old.reddit.com/r/feedsfun/"), + url=str_to_feed_url("https://old.reddit.com/r/feedsfun/"), + ) new_context, result = await _discover_extract_feeds_for_reddit(context) assert new_context == context assert result is None - @pytest.mark.parametrize("url,expected_url", [("https://www.reddit.com/r/feedsfun/", "https://www.reddit.com/r/feedsfun/.rss"), - ("https://www.reddit.com/r/feedsfun/?sd=x", "https://www.reddit.com/r/feedsfun/.rss"), - ("https://www.reddit.com/r/feedsfun", "https://www.reddit.com/r/feedsfun/.rss"), - ("https://reddit.com/r/feedsfun/", "https://reddit.com/r/feedsfun/.rss"), - ("https://reddit.com/r/feedsfun", "https://reddit.com/r/feedsfun/.rss"),]) + @pytest.mark.parametrize( + "url,expected_url", + [ + ("https://www.reddit.com/r/feedsfun/", "https://www.reddit.com/r/feedsfun/.rss"), + ("https://www.reddit.com/r/feedsfun/?sd=x", "https://www.reddit.com/r/feedsfun/.rss"), + ("https://www.reddit.com/r/feedsfun", "https://www.reddit.com/r/feedsfun/.rss"), + ("https://reddit.com/r/feedsfun/", "https://reddit.com/r/feedsfun/.rss"), + ("https://reddit.com/r/feedsfun", "https://reddit.com/r/feedsfun/.rss"), + 
], + ) @pytest.mark.asyncio async def test_new_reddit(self, url: str, expected_url: FeedUrl) -> None: - context = Context(raw_url=UnknownUrl(url), - url=str_to_feed_url(url)) + context = Context(raw_url=UnknownUrl(url), url=str_to_feed_url(url)) new_context, result = await _discover_extract_feeds_for_reddit(context)