diff --git a/changes/unreleased.md b/changes/unreleased.md index 258bad12..21321f1f 100644 --- a/changes/unreleased.md +++ b/changes/unreleased.md @@ -15,3 +15,4 @@ Changes: - Only a single news item now can be opened. - Performance of the News page improved 2-3 times. - ff-172 — Added feeds collection "Entrepreneurship & Startups". Also added cli commands `ffun estimates entries-per-day-for-collection` and `ffun estimates entries-per-day-for-feed` +- ff-199 — Fixed Reddit feeds discovery. diff --git a/ffun/ffun/domain/urls.py b/ffun/ffun/domain/urls.py index ecd354f9..e04bb1d8 100644 --- a/ffun/ffun/domain/urls.py +++ b/ffun/ffun/domain/urls.py @@ -37,7 +37,7 @@ def check_furl_error() -> Iterator[None]: raise -def _construct_f_url(url: UnknownUrl | AbsoluteUrl | str) -> furl | None: +def construct_f_url(url: UnknownUrl | AbsoluteUrl | str) -> furl | None: try: with check_furl_error(): return furl(url) @@ -63,7 +63,7 @@ def _fix_classic_url_to_absolute(url: str) -> AbsoluteUrl | None: if tldextract.extract(domain_part).suffix == "": return None - f_url = _construct_f_url(f"//{url}") + f_url = construct_f_url(f"//{url}") if f_url is None: return None @@ -76,7 +76,7 @@ def normalize_classic_unknown_url(url: UnknownUrl) -> AbsoluteUrl | None: url = UnknownUrl(url.strip()) # check if url is parsable - f_url = _construct_f_url(url) + f_url = construct_f_url(url) if f_url is None: return None @@ -145,7 +145,7 @@ def adjust_classic_full_url(url: UnknownUrl, original_url: AbsoluteUrl | FeedUrl # ATTENTION: see note at the top of the file def adjust_classic_relative_url(url: UnknownUrl, original_url: AbsoluteUrl | FeedUrl) -> AbsoluteUrl | None: - f_url = _construct_f_url(original_url) + f_url = construct_f_url(original_url) if f_url is None: return None diff --git a/ffun/ffun/feeds_discoverer/domain.py b/ffun/ffun/feeds_discoverer/domain.py index a09ee700..59cd1813 100644 --- a/ffun/ffun/feeds_discoverer/domain.py +++ b/ffun/ffun/feeds_discoverer/domain.py @@ -1,4 +1,5 @@ 
import asyncio +import re from bs4 import BeautifulSoup @@ -6,6 +7,7 @@ from ffun.domain.entities import AbsoluteUrl, UnknownUrl from ffun.domain.urls import ( adjust_classic_url, + construct_f_url, filter_out_duplicated_urls, get_parent_url, normalize_classic_unknown_url, @@ -181,6 +183,41 @@ async def _discover_stop_recursion(context: Context) -> tuple[Context, Result | return context, None +_RE_REDDIT_PATH_PREFIX = re.compile(r"^/r/[^/]+/?") + + +async def _discover_extract_feeds_for_reddit(context: Context) -> tuple[Context, Result | None]: + """New Reddit site has no links to RSS feeds => we construct them.""" + assert context.url is not None + + f_url = construct_f_url(context.url) + + assert f_url is not None + + if f_url.host not in ("www.reddit.com", "reddit.com", "old.reddit.com"): + # We are not interested in non-reddit.com domains + return context, None + + if f_url.host == "old.reddit.com": + # Old Reddit site has marked RSS urls in the header + return context, None + + match = _RE_REDDIT_PATH_PREFIX.match(str(f_url.path)) + + if match is None: + return context, None + + base_path = match.group() + + if not base_path.endswith("/"): + base_path += "/" + + f_url.path = f"{base_path}.rss" + f_url.query = None + + return context.replace(candidate_urls={str(f_url)}), None + + # Note: we do not add internal feed discoverer here (like db check: url -> uid -> feed_id), because # - we do not expect significant performance improvement # - internal feed data (news list) may be slightly outdated (not containing the latest news) @@ -189,6 +226,8 @@ async def _discover_stop_recursion(context: Context) -> tuple[Context, Result | _discover_load_url, _discover_extract_feed_info, _discover_stop_recursion, + _discover_extract_feeds_for_reddit, + _discover_check_candidate_links, _discover_create_soup, _discover_extract_feeds_from_links, _discover_check_candidate_links, diff --git a/ffun/ffun/feeds_discoverer/tests/test_domain.py 
b/ffun/ffun/feeds_discoverer/tests/test_domain.py index 53e9d19a..2ff2f0a6 100644 --- a/ffun/ffun/feeds_discoverer/tests/test_domain.py +++ b/ffun/ffun/feeds_discoverer/tests/test_domain.py @@ -10,6 +10,7 @@ _discover_check_parent_urls, _discover_create_soup, _discover_extract_feed_info, + _discover_extract_feeds_for_reddit, _discover_extract_feeds_from_anchors, _discover_extract_feeds_from_links, _discover_load_url, @@ -428,12 +429,59 @@ async def test_depth_not_zero(self) -> None: assert result is None + +class TestDiscoverExtractFeedsForReddit: + + @pytest.mark.asyncio + async def test_not_reddit(self) -> None: + context = Context( + raw_url=UnknownUrl("http://example.com/test"), url=str_to_feed_url("http://example.com/test") + ) + + new_context, result = await _discover_extract_feeds_for_reddit(context) + + assert new_context == context + assert result is None + + @pytest.mark.asyncio + async def test_old_reddit(self) -> None: + context = Context( + raw_url=UnknownUrl("https://old.reddit.com/r/feedsfun/"), + url=str_to_feed_url("https://old.reddit.com/r/feedsfun/"), + ) + + new_context, result = await _discover_extract_feeds_for_reddit(context) + + assert new_context == context + assert result is None + + @pytest.mark.parametrize( + "url,expected_url", + [ + ("https://www.reddit.com/r/feedsfun/", "https://www.reddit.com/r/feedsfun/.rss"), + ("https://www.reddit.com/r/feedsfun/?sd=x", "https://www.reddit.com/r/feedsfun/.rss"), + ("https://www.reddit.com/r/feedsfun", "https://www.reddit.com/r/feedsfun/.rss"), + ("https://reddit.com/r/feedsfun/", "https://reddit.com/r/feedsfun/.rss"), + ("https://reddit.com/r/feedsfun", "https://reddit.com/r/feedsfun/.rss"), + ], + ) + @pytest.mark.asyncio + async def test_new_reddit(self, url: str, expected_url: FeedUrl) -> None: + context = Context(raw_url=UnknownUrl(url), url=str_to_feed_url(url)) + + new_context, result = await _discover_extract_feeds_for_reddit(context) + + assert new_context == 
context.replace(candidate_urls={expected_url}) + assert result is None + + def test_discoverers_list_not_changed() -> None: assert _discoverers == [ _discover_adjust_url, _discover_load_url, _discover_extract_feed_info, _discover_stop_recursion, + _discover_extract_feeds_for_reddit, + _discover_check_candidate_links, _discover_create_soup, _discover_extract_feeds_from_links, _discover_check_candidate_links,