ff-199 Fixed Reddit feeds discovery #308

Merged · 3 commits · Dec 27, 2024
1 change: 1 addition & 0 deletions changes/unreleased.md
@@ -15,3 +15,4 @@ Changes:
 - Only a single news item now can be opened.
 - Performance of the News page improved 2-3 times.
 - ff-172 — Added feeds collection "Entrepreneurship & Startups". Also added cli commands `ffun estimates entries-per-day-for-collection` and `ffun estimates entries-per-day-for-feed`
+- ff-199 — Fixed Reddit feeds discovery.
8 changes: 4 additions & 4 deletions ffun/ffun/domain/urls.py
@@ -37,7 +37,7 @@ def check_furl_error() -> Iterator[None]:
         raise


-def _construct_f_url(url: UnknownUrl | AbsoluteUrl | str) -> furl | None:
+def construct_f_url(url: UnknownUrl | AbsoluteUrl | str) -> furl | None:
     try:
         with check_furl_error():
             return furl(url)
@@ -63,7 +63,7 @@ def _fix_classic_url_to_absolute(url: str) -> AbsoluteUrl | None:
     if tldextract.extract(domain_part).suffix == "":
         return None

-    f_url = _construct_f_url(f"//{url}")
+    f_url = construct_f_url(f"//{url}")

     if f_url is None:
         return None
@@ -76,7 +76,7 @@ def normalize_classic_unknown_url(url: UnknownUrl) -> AbsoluteUrl | None:
     url = UnknownUrl(url.strip())

     # check if url is parsable
-    f_url = _construct_f_url(url)
+    f_url = construct_f_url(url)

     if f_url is None:
         return None
@@ -145,7 +145,7 @@ def adjust_classic_full_url(url: UnknownUrl, original_url: AbsoluteUrl | FeedUrl

 # ATTENTION: see note at the top of the file
 def adjust_classic_relative_url(url: UnknownUrl, original_url: AbsoluteUrl | FeedUrl) -> AbsoluteUrl | None:
-    f_url = _construct_f_url(original_url)
+    f_url = construct_f_url(original_url)

     if f_url is None:
         return None
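
For context: judging from the hunks above, `construct_f_url` (previously private as `_construct_f_url`, now exported for the Reddit discoverer) wraps `furl` parsing and returns `None` for unparsable input. A minimal sketch of that contract, with the `check_furl_error` details omitted and the `ValueError` assumption ours, not the PR's:

```python
# Rough sketch of the construct_f_url contract, inferred from the diff above.
# Assumption: furl signals unparsable input via ValueError (the real code
# funnels errors through its check_furl_error() context manager instead).
from furl import furl


def construct_f_url_sketch(url: str):
    try:
        return furl(url)
    except ValueError:
        return None  # callers check for None and bail out


assert construct_f_url_sketch("https://reddit.com/r/feedsfun") is not None
```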
39 changes: 39 additions & 0 deletions ffun/ffun/feeds_discoverer/domain.py
@@ -1,11 +1,13 @@
 import asyncio
+import re

 from bs4 import BeautifulSoup

 from ffun.core import logging
 from ffun.domain.entities import AbsoluteUrl, UnknownUrl
 from ffun.domain.urls import (
     adjust_classic_url,
+    construct_f_url,
     filter_out_duplicated_urls,
     get_parent_url,
     normalize_classic_unknown_url,
@@ -181,6 +183,41 @@ async def _discover_stop_recursion(context: Context) -> tuple[Context, Result | None]:
     return context, None


+_RE_REDDIT_PATH_PREFIX = re.compile(r"^/r/[^/]+/?")
+
+
+async def _discover_extract_feeds_for_reddit(context: Context) -> tuple[Context, Result | None]:
+    """The new Reddit site has no links to RSS feeds => we construct them."""
+    assert context.url is not None
+
+    f_url = construct_f_url(context.url)
+
+    assert f_url is not None
+
+    if f_url.host not in ("www.reddit.com", "reddit.com", "old.reddit.com"):
+        # We are not interested in non-reddit.com domains
+        return context, None
+
+    if f_url.host == "old.reddit.com":
+        # The old Reddit site marks RSS urls in the page header
+        return context, None
+
+    match = _RE_REDDIT_PATH_PREFIX.match(str(f_url.path))
+
+    if match is None:
+        return context, None
+
+    base_path = match.group()
+
+    if not base_path.endswith("/"):
+        base_path += "/"
+
+    f_url.path = f"{base_path}.rss"
+    f_url.query = None
+
+    return context.replace(candidate_urls={str(f_url)}), None
+
+
 # Note: we do not add internal feed discoverer here (like db check: url -> uid -> feed_id), because
 # - we do not expect significant performance improvement
 # - internal feed data (news list) may be slightly outdated (not containing the latest news)
@@ -189,6 +226,8 @@ async def _discover_stop_recursion(context: Context) -> tuple[Context, Result | None]:
     _discover_load_url,
     _discover_extract_feed_info,
     _discover_stop_recursion,
+    _discover_extract_feeds_for_reddit,
+    _discover_check_candidate_links,
     _discover_create_soup,
     _discover_extract_feeds_from_links,
     _discover_check_candidate_links,
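
The URL rewrite performed by `_discover_extract_feeds_for_reddit` can be exercised in isolation. A standalone sketch of the same transformation — the `reddit_feed_url` helper is hypothetical, not part of the PR:

```python
# Hypothetical standalone version of the subreddit -> RSS rewrite above.
import re

from furl import furl

_REDDIT_PREFIX = re.compile(r"^/r/[^/]+/?")


def reddit_feed_url(url: str):
    f_url = furl(url)

    if f_url.host not in ("www.reddit.com", "reddit.com"):
        return None  # old.reddit.com pages already advertise their feeds

    match = _REDDIT_PREFIX.match(str(f_url.path))

    if match is None:
        return None  # not a subreddit page

    base_path = match.group()

    if not base_path.endswith("/"):
        base_path += "/"

    f_url.path = f"{base_path}.rss"
    f_url.query = None  # query parameters are irrelevant for the feed

    return str(f_url)


assert reddit_feed_url("https://www.reddit.com/r/feedsfun?sd=x") == "https://www.reddit.com/r/feedsfun/.rss"
```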
48 changes: 48 additions & 0 deletions ffun/ffun/feeds_discoverer/tests/test_domain.py
@@ -10,6 +10,7 @@
     _discover_check_parent_urls,
     _discover_create_soup,
     _discover_extract_feed_info,
+    _discover_extract_feeds_for_reddit,
     _discover_extract_feeds_from_anchors,
     _discover_extract_feeds_from_links,
     _discover_load_url,
@@ -428,12 +429,59 @@ async def test_depth_not_zero(self) -> None:
         assert result is None


+class TestDiscoverExtractFeedsForReddit:
+
+    @pytest.mark.asyncio
+    async def test_not_reddit(self) -> None:
+        context = Context(
+            raw_url=UnknownUrl("http://example.com/test"), url=str_to_feed_url("http://example.com/test")
+        )
+
+        new_context, result = await _discover_extract_feeds_for_reddit(context)
+
+        assert new_context == context
+        assert result is None
+
+    @pytest.mark.asyncio
+    async def test_old_reddit(self) -> None:
+        context = Context(
+            raw_url=UnknownUrl("https://old.reddit.com/r/feedsfun/"),
+            url=str_to_feed_url("https://old.reddit.com/r/feedsfun/"),
+        )
+
+        new_context, result = await _discover_extract_feeds_for_reddit(context)
+
+        assert new_context == context
+        assert result is None
+
+    @pytest.mark.parametrize(
+        "url,expected_url",
+        [
+            ("https://www.reddit.com/r/feedsfun/", "https://www.reddit.com/r/feedsfun/.rss"),
+            ("https://www.reddit.com/r/feedsfun/?sd=x", "https://www.reddit.com/r/feedsfun/.rss"),
+            ("https://www.reddit.com/r/feedsfun", "https://www.reddit.com/r/feedsfun/.rss"),
+            ("https://reddit.com/r/feedsfun/", "https://reddit.com/r/feedsfun/.rss"),
+            ("https://reddit.com/r/feedsfun", "https://reddit.com/r/feedsfun/.rss"),
+        ],
+    )
+    @pytest.mark.asyncio
+    async def test_new_reddit(self, url: str, expected_url: FeedUrl) -> None:
+        context = Context(raw_url=UnknownUrl(url), url=str_to_feed_url(url))
+
+        new_context, result = await _discover_extract_feeds_for_reddit(context)
+
+        assert new_context == context.replace(candidate_urls={expected_url})
+        assert result is None
+
+
 def test_discoverers_list_not_changed() -> None:
     assert _discoverers == [
         _discover_adjust_url,
         _discover_load_url,
         _discover_extract_feed_info,
         _discover_stop_recursion,
+        _discover_extract_feeds_for_reddit,
+        _discover_check_candidate_links,
         _discover_create_soup,
         _discover_extract_feeds_from_links,
         _discover_check_candidate_links,
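
Judging by the signatures and the `_discoverers` list, discovery is a chain of async steps: each takes a `Context` and returns an updated context plus an optional final `Result`, and `_discover_extract_feeds_for_reddit` slots in right before `_discover_check_candidate_links` so its synthesized `.rss` URL is validated like any other candidate. A minimal sketch of such a driver loop — all names here are illustrative stand-ins, not the project's actual runner:

```python
# Illustrative driver for a discoverer chain like _discoverers above.
# Assumption: the real code runs the steps in order and stops at the
# first non-None Result; Context/Result below are simplified stand-ins.
from dataclasses import dataclass, replace
from typing import Awaitable, Callable, Optional


@dataclass(frozen=True)
class Context:
    url: str
    candidate_urls: frozenset = frozenset()

    def replace(self, **changes):
        return replace(self, **changes)


@dataclass
class Result:
    feeds: list


Discoverer = Callable[[Context], Awaitable[tuple[Context, Optional[Result]]]]


async def run_discoverers(context: Context, discoverers: list) -> Optional[Result]:
    for discoverer in discoverers:
        context, result = await discoverer(context)

        if result is not None:  # first definitive result wins
            return result

    return None
```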