src/crawlee/_utils/urls.py (9 additions, 2 deletions)

@@ -7,6 +7,7 @@
 
 if TYPE_CHECKING:
     from collections.abc import Iterator
+    from logging import Logger
 
 
 def is_url_absolute(url: str) -> bool:

@@ -22,13 +23,19 @@ def convert_to_absolute_url(base_url: str, relative_url: str) -> str:
     return str(URL(base_url).join(URL(relative_url)))
 
 
-def to_absolute_url_iterator(base_url: str, urls: Iterator[str]) -> Iterator[str]:
+def to_absolute_url_iterator(base_url: str, urls: Iterator[str], logger: Logger | None = None) -> Iterator[str]:
     """Convert an iterator of relative URLs to absolute URLs using a base URL."""
     for url in urls:
         if is_url_absolute(url):
             yield url
         else:
-            yield convert_to_absolute_url(base_url, url)
+            converted_url = convert_to_absolute_url(base_url, url)
+            # Skip the URL if conversion fails, probably due to an incorrect format, such as 'mailto:'.
+            if not is_url_absolute(converted_url):
+                if logger:
+                    logger.debug(f'Could not convert URL "{url}" to absolute using base URL "{base_url}". Skipping it.')
+                continue
+            yield converted_url
 
 
 _http_url_adapter = TypeAdapter(AnyHttpUrl)
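
A minimal usage sketch (not part of the PR) of the updated helper, assuming is_url_absolute() still reports a mailto: URI as non-absolute even after joining; the base URL and logger name are illustrative:

import logging

from crawlee._utils.urls import to_absolute_url_iterator

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger('url_demo')  # illustrative logger name

links = iter(['/sub_index', 'mailto:[email protected]', 'https://example.com/page_1'])
absolute = list(to_absolute_url_iterator('https://example.com', links, logger=logger))

# Under the assumption above, the mailto: entry is dropped with a debug message,
# the relative path is joined against the base URL, and the already-absolute URL
# passes through unchanged.
print(absolute)  # ['https://example.com/sub_index', 'https://example.com/page_1']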

(file path not shown in this view)

@@ -167,7 +167,9 @@ async def extract_links(
         kwargs.setdefault('strategy', 'same-hostname')
 
         links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-        links_iterator = to_absolute_url_iterator(context.request.loaded_url or context.request.url, links_iterator)
+        links_iterator = to_absolute_url_iterator(
+            context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+        )
 
         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
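
The unchanged context above splits the extracted links by robots.txt permission before enqueueing. A small illustrative sketch of that split, assuming the partition helper used here follows the more_itertools.partition convention (items failing the predicate come first, items passing it second); is_allowed is a stand-in for robots_txt_file.is_allowed:

from more_itertools import partition

def is_allowed(url: str) -> bool:
    # Stand-in predicate for robots_txt_file.is_allowed.
    return not url.endswith('/private')

links = iter(['https://example.com/', 'https://example.com/private'])
skipped, allowed = partition(is_allowed, links)

print(list(skipped))  # ['https://example.com/private']  (disallowed by robots.txt)
print(list(allowed))  # ['https://example.com/']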

src/crawlee/crawlers/_playwright/_playwright_crawler.py (3 additions, 1 deletion)

@@ -366,7 +366,9 @@ async def extract_links(
         links_iterator: Iterator[str] = iter(
             [url for element in elements if (url := await element.get_attribute('href')) is not None]
         )
-        links_iterator = to_absolute_url_iterator(context.request.loaded_url or context.request.url, links_iterator)
+        links_iterator = to_absolute_url_iterator(
+            context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+        )
 
         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)

(file path not shown in this view)

@@ -30,7 +30,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
     assert handler.called
 
     # The handler should find two links
-    assert len(handler.call_args[0][0]) == 2
+    assert len(handler.call_args[0][0]) == 3
 
 
 async def test_enqueue_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None:

tests/unit/crawlers/_parsel/test_parsel_crawler.py (1 addition, 1 deletion)

@@ -32,7 +32,7 @@ async def request_handler(context: ParselCrawlingContext) -> None:
     assert handler.called
 
     # The handler should find two links
-    assert len(handler.call_args[0][0]) == 2
+    assert len(handler.call_args[0][0]) == 3
 
 
 async def test_enqueue_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None:

tests/unit/server_endpoints.py (1 addition, 0 deletions)

@@ -14,6 +14,7 @@
 <body>
 <a href="/sub_index" class="foo">Link 1</a>
 <a href="/page_1">Link 2</a>
+<a href="mailto:[email protected]">[email protected]</a>
 </body></html>"""
 
 SECONDARY_INDEX = b"""\
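
The extra mailto: anchor in the INDEX fixture is why the expected link count in the BeautifulSoup and Parsel tests above moves from 2 to 3. A quick illustrative check of the anchor count (only the <body> shown in the hunk is used; the rest of the fixture is trimmed):

from bs4 import BeautifulSoup

BODY = b"""<body>
<a href="/sub_index" class="foo">Link 1</a>
<a href="/page_1">Link 2</a>
<a href="mailto:[email protected]">[email protected]</a>
</body>"""

print(len(BeautifulSoup(BODY, 'html.parser').find_all('a')))  # 3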