ff-199 Fixed Reddit feeds discovery #308

Merged · 3 commits · Dec 27, 2024
1 change: 1 addition & 0 deletions changes/unreleased.md
@@ -15,3 +15,4 @@ Changes:
 - Only a single news item now can be opened.
 - Performance of the News page improved 2-3 times.
 - ff-172 — Added feeds collection "Entrepreneurship & Startups". Also added cli commands `ffun estimates entries-per-day-for-collection` and `ffun estimates entries-per-day-for-feed`
+- ff-199 — Fixed Reddit feeds discovery.
8 changes: 4 additions & 4 deletions ffun/ffun/domain/urls.py
@@ -37,7 +37,7 @@ def check_furl_error() -> Iterator[None]:
         raise


-def _construct_f_url(url: UnknownUrl | AbsoluteUrl | str) -> furl | None:
+def construct_f_url(url: UnknownUrl | AbsoluteUrl | str) -> furl | None:
     try:
         with check_furl_error():
             return furl(url)
@@ -63,7 +63,7 @@ def _fix_classic_url_to_absolute(url: str) -> AbsoluteUrl | None:
     if tldextract.extract(domain_part).suffix == "":
         return None

-    f_url = _construct_f_url(f"//{url}")
+    f_url = construct_f_url(f"//{url}")

     if f_url is None:
         return None
@@ -76,7 +76,7 @@ def normalize_classic_unknown_url(url: UnknownUrl) -> AbsoluteUrl | None:
     url = UnknownUrl(url.strip())

     # check if url is parsable
-    f_url = _construct_f_url(url)
+    f_url = construct_f_url(url)

     if f_url is None:
         return None
@@ -145,7 +145,7 @@ def adjust_classic_full_url(url: UnknownUrl, original_url: AbsoluteUrl | FeedUrl

 # ATTENTION: see note at the top of the file
 def adjust_classic_relative_url(url: UnknownUrl, original_url: AbsoluteUrl | FeedUrl) -> AbsoluteUrl | None:
-    f_url = _construct_f_url(original_url)
+    f_url = construct_f_url(original_url)

     if f_url is None:
         return None
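
For context: judging from the hunks above, `construct_f_url` (previously private as `_construct_f_url`, now exported for the Reddit discoverer) wraps `furl` parsing and returns `None` for unparsable input. A minimal sketch of that contract, with the `check_furl_error` details omitted and the `ValueError` assumption ours, not the PR's:

```python
# Rough sketch of the construct_f_url contract, inferred from the diff above.
# Assumption: furl signals unparsable input via ValueError (the real code
# funnels errors through its check_furl_error() context manager instead).
from furl import furl


def construct_f_url_sketch(url: str):
    try:
        return furl(url)
    except ValueError:
        return None  # callers check for None and bail out


assert construct_f_url_sketch("https://reddit.com/r/feedsfun") is not None
```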
39 changes: 39 additions & 0 deletions ffun/ffun/feeds_discoverer/domain.py
@@ -1,11 +1,13 @@
 import asyncio
+import re

 from bs4 import BeautifulSoup

 from ffun.core import logging
 from ffun.domain.entities import AbsoluteUrl, UnknownUrl
 from ffun.domain.urls import (
     adjust_classic_url,
+    construct_f_url,
     filter_out_duplicated_urls,
     get_parent_url,
     normalize_classic_unknown_url,
@@ -181,6 +183,41 @@ async def _discover_stop_recursion(context: Context) -> tuple[Context, Result | None]:
     return context, None


+_RE_REDDIT_PATH_PREFIX = re.compile(r"^/r/[^/]+/?")
+
+
+async def _discover_extract_feeds_for_reddit(context: Context) -> tuple[Context, Result | None]:
+    """The new Reddit site has no links to RSS feeds => we construct them."""
+    assert context.url is not None
+
+    f_url = construct_f_url(context.url)
+
+    assert f_url is not None
+
+    if f_url.host not in ("www.reddit.com", "reddit.com", "old.reddit.com"):
+        # We are not interested in non-reddit.com domains
+        return context, None
+
+    if f_url.host == "old.reddit.com":
+        # The old Reddit site marks RSS urls in the page header
+        return context, None
+
+    match = _RE_REDDIT_PATH_PREFIX.match(str(f_url.path))
+
+    if match is None:
+        return context, None
+
+    base_path = match.group()
+
+    if not base_path.endswith("/"):
+        base_path += "/"
+
+    f_url.path = f"{base_path}.rss"
+    f_url.query = None
+
+    return context.replace(candidate_urls={str(f_url)}), None
+
+
 # Note: we do not add internal feed discoverer here (like db check: url -> uid -> feed_id), because
 # - we do not expect significant performance improvement
 # - internal feed data (news list) may be slightly outdated (not containing the latest news)
@@ -189,6 +226,8 @@ async def _discover_stop_recursion(context: Context) -> tuple[Context, Result | None]:
     _discover_load_url,
     _discover_extract_feed_info,
     _discover_stop_recursion,
+    _discover_extract_feeds_for_reddit,
+    _discover_check_candidate_links,
     _discover_create_soup,
     _discover_extract_feeds_from_links,
     _discover_check_candidate_links,
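
The URL rewrite performed by `_discover_extract_feeds_for_reddit` can be exercised in isolation. A standalone sketch of the same transformation — the `reddit_feed_url` helper is hypothetical, not part of the PR:

```python
# Hypothetical standalone version of the subreddit -> RSS rewrite above.
import re

from furl import furl

_REDDIT_PREFIX = re.compile(r"^/r/[^/]+/?")


def reddit_feed_url(url: str):
    f_url = furl(url)

    if f_url.host not in ("www.reddit.com", "reddit.com"):
        return None  # old.reddit.com pages already advertise their feeds

    match = _REDDIT_PREFIX.match(str(f_url.path))

    if match is None:
        return None  # not a subreddit page

    base_path = match.group()

    if not base_path.endswith("/"):
        base_path += "/"

    f_url.path = f"{base_path}.rss"
    f_url.query = None  # query parameters are irrelevant for the feed

    return str(f_url)


assert reddit_feed_url("https://www.reddit.com/r/feedsfun?sd=x") == "https://www.reddit.com/r/feedsfun/.rss"
```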
48 changes: 48 additions & 0 deletions ffun/ffun/feeds_discoverer/tests/test_domain.py
@@ -10,6 +10,7 @@
     _discover_check_parent_urls,
     _discover_create_soup,
     _discover_extract_feed_info,
+    _discover_extract_feeds_for_reddit,
     _discover_extract_feeds_from_anchors,
     _discover_extract_feeds_from_links,
     _discover_load_url,
@@ -428,12 +429,59 @@ async def test_depth_not_zero(self) -> None:
         assert result is None


+class TestDiscoverExtractFeedsForReddit:
+
+    @pytest.mark.asyncio
+    async def test_not_reddit(self) -> None:
+        context = Context(
+            raw_url=UnknownUrl("http://example.com/test"), url=str_to_feed_url("http://example.com/test")
+        )
+
+        new_context, result = await _discover_extract_feeds_for_reddit(context)
+
+        assert new_context == context
+        assert result is None
+
+    @pytest.mark.asyncio
+    async def test_old_reddit(self) -> None:
+        context = Context(
+            raw_url=UnknownUrl("https://old.reddit.com/r/feedsfun/"),
+            url=str_to_feed_url("https://old.reddit.com/r/feedsfun/"),
+        )
+
+        new_context, result = await _discover_extract_feeds_for_reddit(context)
+
+        assert new_context == context
+        assert result is None
+
+    @pytest.mark.parametrize(
+        "url,expected_url",
+        [
+            ("https://www.reddit.com/r/feedsfun/", "https://www.reddit.com/r/feedsfun/.rss"),
+            ("https://www.reddit.com/r/feedsfun/?sd=x", "https://www.reddit.com/r/feedsfun/.rss"),
+            ("https://www.reddit.com/r/feedsfun", "https://www.reddit.com/r/feedsfun/.rss"),
+            ("https://reddit.com/r/feedsfun/", "https://reddit.com/r/feedsfun/.rss"),
+            ("https://reddit.com/r/feedsfun", "https://reddit.com/r/feedsfun/.rss"),
+        ],
+    )
+    @pytest.mark.asyncio
+    async def test_new_reddit(self, url: str, expected_url: FeedUrl) -> None:
+        context = Context(raw_url=UnknownUrl(url), url=str_to_feed_url(url))
+
+        new_context, result = await _discover_extract_feeds_for_reddit(context)
+
+        assert new_context == context.replace(candidate_urls={expected_url})
+        assert result is None
+
+
 def test_discoverers_list_not_changed() -> None:
     assert _discoverers == [
         _discover_adjust_url,
         _discover_load_url,
         _discover_extract_feed_info,
         _discover_stop_recursion,
+        _discover_extract_feeds_for_reddit,
+        _discover_check_candidate_links,
         _discover_create_soup,
         _discover_extract_feeds_from_links,
         _discover_check_candidate_links,
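
Judging by the signatures and the `_discoverers` list, discovery is a chain of async steps: each takes a `Context` and returns an updated context plus an optional final `Result`, and `_discover_extract_feeds_for_reddit` slots in right before `_discover_check_candidate_links` so its synthesized `.rss` URL is validated like any other candidate. A minimal sketch of such a driver loop — all names here are illustrative stand-ins, not the project's actual runner:

```python
# Illustrative driver for a discoverer chain like _discoverers above.
# Assumption: the real code runs the steps in order and stops at the
# first non-None Result; Context/Result below are simplified stand-ins.
from dataclasses import dataclass, replace
from typing import Awaitable, Callable, Optional


@dataclass(frozen=True)
class Context:
    url: str
    candidate_urls: frozenset = frozenset()

    def replace(self, **changes):
        return replace(self, **changes)


@dataclass
class Result:
    feeds: list


Discoverer = Callable[[Context], Awaitable[tuple[Context, Optional[Result]]]]


async def run_discoverers(context: Context, discoverers: list) -> Optional[Result]:
    for discoverer in discoverers:
        context, result = await discoverer(context)

        if result is not None:  # first definitive result wins
            return result

    return None
```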