From 1e5672c9981f442f13890ef71963575ad464dfeb Mon Sep 17 00:00:00 2001
From: "Aliaksei Yaletski (Tiendil)" <a.eletsky@gmail.com>
Date: Fri, 27 Dec 2024 17:42:43 +0100
Subject: [PATCH 1/3] Add Reddit feeds discoverer; make construct_f_url public

---
 ffun/ffun/domain/urls.py                      |  8 ++--
 ffun/ffun/feeds_discoverer/domain.py          | 40 ++++++++++++++++++
 .../feeds_discoverer/tests/test_domain.py     | 41 +++++++++++++++++++
 3 files changed, 85 insertions(+), 4 deletions(-)

diff --git a/ffun/ffun/domain/urls.py b/ffun/ffun/domain/urls.py
index ecd354f9..e04bb1d8 100644
--- a/ffun/ffun/domain/urls.py
+++ b/ffun/ffun/domain/urls.py
@@ -37,7 +37,7 @@ def check_furl_error() -> Iterator[None]:
         raise
 
 
-def _construct_f_url(url: UnknownUrl | AbsoluteUrl | str) -> furl | None:
+def construct_f_url(url: UnknownUrl | AbsoluteUrl | str) -> furl | None:
     try:
         with check_furl_error():
             return furl(url)
@@ -63,7 +63,7 @@ def _fix_classic_url_to_absolute(url: str) -> AbsoluteUrl | None:
     if tldextract.extract(domain_part).suffix == "":
         return None
 
-    f_url = _construct_f_url(f"//{url}")
+    f_url = construct_f_url(f"//{url}")
 
     if f_url is None:
         return None
@@ -76,7 +76,7 @@ def normalize_classic_unknown_url(url: UnknownUrl) -> AbsoluteUrl | None:
     url = UnknownUrl(url.strip())
 
     # check if url is parsable
-    f_url = _construct_f_url(url)
+    f_url = construct_f_url(url)
 
     if f_url is None:
         return None
@@ -145,7 +145,7 @@ def adjust_classic_full_url(url: UnknownUrl, original_url: AbsoluteUrl | FeedUrl
 
 # ATTENTION: see note at the top of the file
 def adjust_classic_relative_url(url: UnknownUrl, original_url: AbsoluteUrl | FeedUrl) -> AbsoluteUrl | None:
-    f_url = _construct_f_url(original_url)
+    f_url = construct_f_url(original_url)
 
     if f_url is None:
         return None
diff --git a/ffun/ffun/feeds_discoverer/domain.py b/ffun/ffun/feeds_discoverer/domain.py
index a09ee700..720907e9 100644
--- a/ffun/ffun/feeds_discoverer/domain.py
+++ b/ffun/ffun/feeds_discoverer/domain.py
@@ -1,6 +1,7 @@
 import asyncio
 
 from bs4 import BeautifulSoup
+import re
 
 from ffun.core import logging
 from ffun.domain.entities import AbsoluteUrl, UnknownUrl
@@ -11,6 +12,7 @@
     normalize_classic_unknown_url,
     to_feed_url,
     url_has_extension,
+    construct_f_url,
 )
 from ffun.feeds_discoverer.entities import Context, Discoverer, Result, Status
 from ffun.loader import domain as lo_domain
@@ -181,6 +183,42 @@ async def _discover_stop_recursion(context: Context) -> tuple[Context, Result |
     return context, None
 
 
+_RE_REDDIT_PATH_PREFIX = re.compile(r"^/r/[^/]+/?")
+
+
+async def _discover_extract_feeds_for_reddit(context: Context) -> tuple[Context, Result | None]:
+    """The new Reddit site has no links to RSS feeds, so we construct them."""
+    assert context.url is not None
+
+    f_url = construct_f_url(context.url)
+
+    assert f_url is not None
+
+    if f_url.host not in ("www.reddit.com", "reddit.com", "old.reddit.com"):
+        # Hosts other than reddit.com are not handled by this discoverer
+        return context, None
+
+    if f_url.host == "old.reddit.com":
+        # The old Reddit site already declares its RSS URLs in the page header
+        return context, None
+
+    match = _RE_REDDIT_PATH_PREFIX.match(str(f_url.path))
+
+    if match is None:
+        return context, None
+
+    base_path = match.group()
+
+    if not base_path.endswith("/"):
+        base_path += "/"
+
+    f_url.path = f"{base_path}.rss"
+    f_url.query = None
+
+    return context.replace(candidate_urls={str(f_url)}), None
+
+
+
 # Note: we do not add internal feed discoverer here (like db check: url -> uid -> feed_id), because
 #       - we do not expect significant performance improvement
 #       - internal feed data (news list) may be slightly outdated (not containing the latest news)
@@ -189,6 +227,8 @@ async def _discover_stop_recursion(context: Context) -> tuple[Context, Result |
     _discover_load_url,
     _discover_extract_feed_info,
     _discover_stop_recursion,
+    _discover_extract_feeds_for_reddit,
+    _discover_check_candidate_links,
     _discover_create_soup,
     _discover_extract_feeds_from_links,
     _discover_check_candidate_links,
diff --git a/ffun/ffun/feeds_discoverer/tests/test_domain.py b/ffun/ffun/feeds_discoverer/tests/test_domain.py
index 53e9d19a..43910df3 100644
--- a/ffun/ffun/feeds_discoverer/tests/test_domain.py
+++ b/ffun/ffun/feeds_discoverer/tests/test_domain.py
@@ -15,6 +15,7 @@
     _discover_load_url,
     _discover_stop_recursion,
     _discoverers,
+    _discover_extract_feeds_for_reddit,
     discover,
 )
 from ffun.feeds_discoverer.entities import Context, Result, Status
@@ -428,12 +429,52 @@ async def test_depth_not_zero(self) -> None:
         assert result is None
 
 
+class TestDiscoverExtractFeedsForReddit:
+
+    @pytest.mark.asyncio
+    async def test_not_reddit(self) -> None:
+        context = Context(raw_url=UnknownUrl("http://example.com/test"),
+                          url=str_to_feed_url("http://example.com/test"))
+
+        new_context, result = await _discover_extract_feeds_for_reddit(context)
+
+        assert new_context == context
+        assert result is None
+
+    @pytest.mark.asyncio
+    async def test_old_reditt(self) -> None:
+        context = Context(raw_url=UnknownUrl("https://old.reddit.com/r/feedsfun/"),
+                          url=str_to_feed_url("https://old.reddit.com/r/feedsfun/"))
+
+        new_context, result = await _discover_extract_feeds_for_reddit(context)
+
+        assert new_context == context
+        assert result is None
+
+    @pytest.mark.parametrize("url,expected_url", [("https://www.reddit.com/r/feedsfun/", "https://www.reddit.com/r/feedsfun/.rss"),
+                                                  ("https://www.reddit.com/r/feedsfun/?sd=x", "https://www.reddit.com/r/feedsfun/.rss"),
+                                                  ("https://www.reddit.com/r/feedsfun", "https://www.reddit.com/r/feedsfun/.rss"),
+                                                  ("https://reddit.com/r/feedsfun/", "https://reddit.com/r/feedsfun/.rss"),
+                                                  ("https://reddit.com/r/feedsfun", "https://reddit.com/r/feedsfun/.rss"),])
+    @pytest.mark.asyncio
+    async def test_new_reddit(self, url: str, expected_url: FeedUrl) -> None:
+        context = Context(raw_url=UnknownUrl(url),
+                          url=str_to_feed_url(url))
+
+        new_context, result = await _discover_extract_feeds_for_reddit(context)
+
+        assert new_context == context.replace(candidate_urls={expected_url})
+        assert result is None
+
+
 def test_discoverers_list_not_changed() -> None:
     assert _discoverers == [
         _discover_adjust_url,
         _discover_load_url,
         _discover_extract_feed_info,
         _discover_stop_recursion,
+        _discover_extract_feeds_for_reddit,
+        _discover_check_candidate_links,
         _discover_create_soup,
         _discover_extract_feeds_from_links,
         _discover_check_candidate_links,

From 856506643324520cec798e13d729f72eea8d5742 Mon Sep 17 00:00:00 2001
From: "Aliaksei Yaletski (Tiendil)" <a.eletsky@gmail.com>
Date: Fri, 27 Dec 2024 17:46:39 +0100
Subject: [PATCH 2/3] changes

---
 changes/unreleased.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/changes/unreleased.md b/changes/unreleased.md
index 258bad12..21321f1f 100644
--- a/changes/unreleased.md
+++ b/changes/unreleased.md
@@ -15,3 +15,4 @@ Changes:
   - Only a single news item now can be opened.
   - Performance of the News page improved 2-3 times.
 - ff-172 — Added feeds collection "Entrepreneurship & Startups". Also added cli commands `ffun estimates entries-per-day-for-collection` and `ffun estimates entries-per-day-for-feed`
+- ff-199 — Fixed Reddit feeds discovery.

From bb30b740046a0a165a4361af0d21f0ae2ec9c3c7 Mon Sep 17 00:00:00 2001
From: "Aliaksei Yaletski (Tiendil)" <a.eletsky@gmail.com>
Date: Fri, 27 Dec 2024 17:47:58 +0100
Subject: [PATCH 3/3] formatting

---
 ffun/ffun/feeds_discoverer/domain.py          |  5 ++-
 .../feeds_discoverer/tests/test_domain.py     | 31 ++++++++++++-------
 2 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/ffun/ffun/feeds_discoverer/domain.py b/ffun/ffun/feeds_discoverer/domain.py
index 720907e9..59cd1813 100644
--- a/ffun/ffun/feeds_discoverer/domain.py
+++ b/ffun/ffun/feeds_discoverer/domain.py
@@ -1,18 +1,18 @@
 import asyncio
+import re
 
 from bs4 import BeautifulSoup
-import re
 
 from ffun.core import logging
 from ffun.domain.entities import AbsoluteUrl, UnknownUrl
 from ffun.domain.urls import (
     adjust_classic_url,
+    construct_f_url,
     filter_out_duplicated_urls,
     get_parent_url,
     normalize_classic_unknown_url,
     to_feed_url,
     url_has_extension,
-    construct_f_url,
 )
 from ffun.feeds_discoverer.entities import Context, Discoverer, Result, Status
 from ffun.loader import domain as lo_domain
@@ -218,7 +218,6 @@ async def _discover_extract_feeds_for_reddit(context: Context) -> tuple[Context,
     return context.replace(candidate_urls={str(f_url)}), None
 
 
-
 # Note: we do not add internal feed discoverer here (like db check: url -> uid -> feed_id), because
 #       - we do not expect significant performance improvement
 #       - internal feed data (news list) may be slightly outdated (not containing the latest news)
diff --git a/ffun/ffun/feeds_discoverer/tests/test_domain.py b/ffun/ffun/feeds_discoverer/tests/test_domain.py
index 43910df3..2ff2f0a6 100644
--- a/ffun/ffun/feeds_discoverer/tests/test_domain.py
+++ b/ffun/ffun/feeds_discoverer/tests/test_domain.py
@@ -10,12 +10,12 @@
     _discover_check_parent_urls,
     _discover_create_soup,
     _discover_extract_feed_info,
+    _discover_extract_feeds_for_reddit,
     _discover_extract_feeds_from_anchors,
     _discover_extract_feeds_from_links,
     _discover_load_url,
     _discover_stop_recursion,
     _discoverers,
-    _discover_extract_feeds_for_reddit,
     discover,
 )
 from ffun.feeds_discoverer.entities import Context, Result, Status
@@ -433,8 +433,9 @@ class TestDiscoverExtractFeedsForReddit:
 
     @pytest.mark.asyncio
     async def test_not_reddit(self) -> None:
-        context = Context(raw_url=UnknownUrl("http://example.com/test"),
-                          url=str_to_feed_url("http://example.com/test"))
+        context = Context(
+            raw_url=UnknownUrl("http://example.com/test"), url=str_to_feed_url("http://example.com/test")
+        )
 
         new_context, result = await _discover_extract_feeds_for_reddit(context)
 
@@ -443,23 +444,29 @@ async def test_not_reddit(self) -> None:
 
     @pytest.mark.asyncio
     async def test_old_reditt(self) -> None:
-        context = Context(raw_url=UnknownUrl("https://old.reddit.com/r/feedsfun/"),
-                          url=str_to_feed_url("https://old.reddit.com/r/feedsfun/"))
+        context = Context(
+            raw_url=UnknownUrl("https://old.reddit.com/r/feedsfun/"),
+            url=str_to_feed_url("https://old.reddit.com/r/feedsfun/"),
+        )
 
         new_context, result = await _discover_extract_feeds_for_reddit(context)
 
         assert new_context == context
         assert result is None
 
-    @pytest.mark.parametrize("url,expected_url", [("https://www.reddit.com/r/feedsfun/", "https://www.reddit.com/r/feedsfun/.rss"),
-                                                  ("https://www.reddit.com/r/feedsfun/?sd=x", "https://www.reddit.com/r/feedsfun/.rss"),
-                                                  ("https://www.reddit.com/r/feedsfun", "https://www.reddit.com/r/feedsfun/.rss"),
-                                                  ("https://reddit.com/r/feedsfun/", "https://reddit.com/r/feedsfun/.rss"),
-                                                  ("https://reddit.com/r/feedsfun", "https://reddit.com/r/feedsfun/.rss"),])
+    @pytest.mark.parametrize(
+        "url,expected_url",
+        [
+            ("https://www.reddit.com/r/feedsfun/", "https://www.reddit.com/r/feedsfun/.rss"),
+            ("https://www.reddit.com/r/feedsfun/?sd=x", "https://www.reddit.com/r/feedsfun/.rss"),
+            ("https://www.reddit.com/r/feedsfun", "https://www.reddit.com/r/feedsfun/.rss"),
+            ("https://reddit.com/r/feedsfun/", "https://reddit.com/r/feedsfun/.rss"),
+            ("https://reddit.com/r/feedsfun", "https://reddit.com/r/feedsfun/.rss"),
+        ],
+    )
     @pytest.mark.asyncio
     async def test_new_reddit(self, url: str, expected_url: FeedUrl) -> None:
-        context = Context(raw_url=UnknownUrl(url),
-                          url=str_to_feed_url(url))
+        context = Context(raw_url=UnknownUrl(url), url=str_to_feed_url(url))
 
         new_context, result = await _discover_extract_feeds_for_reddit(context)