diff --git a/src/crawlee/_utils/urls.py b/src/crawlee/_utils/urls.py
index 6bbb796658..0bc5a051c7 100644
--- a/src/crawlee/_utils/urls.py
+++ b/src/crawlee/_utils/urls.py
@@ -7,6 +7,7 @@
 
 if TYPE_CHECKING:
     from collections.abc import Iterator
+    from logging import Logger
 
 
 def is_url_absolute(url: str) -> bool:
@@ -22,13 +23,19 @@ def convert_to_absolute_url(base_url: str, relative_url: str) -> str:
     return str(URL(base_url).join(URL(relative_url)))
 
 
-def to_absolute_url_iterator(base_url: str, urls: Iterator[str]) -> Iterator[str]:
+def to_absolute_url_iterator(base_url: str, urls: Iterator[str], logger: Logger | None = None) -> Iterator[str]:
     """Convert an iterator of relative URLs to absolute URLs using a base URL."""
     for url in urls:
         if is_url_absolute(url):
             yield url
         else:
-            yield convert_to_absolute_url(base_url, url)
+            converted_url = convert_to_absolute_url(base_url, url)
+            # Skip the URL if conversion fails, probably due to an incorrect format, such as 'mailto:'.
+            if not is_url_absolute(converted_url):
+                if logger:
+                    logger.debug(f'Could not convert URL "{url}" to absolute using base URL "{base_url}". Skipping it.')
+                continue
+            yield converted_url
 
 
 _http_url_adapter = TypeAdapter(AnyHttpUrl)
diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
index dec4da5c45..b2678149df 100644
--- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
+++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
@@ -167,7 +167,9 @@ async def extract_links(
         kwargs.setdefault('strategy', 'same-hostname')
 
         links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-        links_iterator = to_absolute_url_iterator(context.request.loaded_url or context.request.url, links_iterator)
+        links_iterator = to_absolute_url_iterator(
+            context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+        )
 
         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py
index c32a9c9f27..1c9a82d993 100644
--- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py
+++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py
@@ -366,7 +366,9 @@ async def extract_links(
         links_iterator: Iterator[str] = iter(
             [url for element in elements if (url := await element.get_attribute('href')) is not None]
         )
-        links_iterator = to_absolute_url_iterator(context.request.loaded_url or context.request.url, links_iterator)
+        links_iterator = to_absolute_url_iterator(
+            context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+        )
 
         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
diff --git a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py
index ff216a849d..15a047d725 100644
--- a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py
+++ b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py
@@ -30,7 +30,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
     assert handler.called
 
-    # The handler should find two links
-    assert len(handler.call_args[0][0]) == 2
+    # The handler should find three links
+    assert len(handler.call_args[0][0]) == 3
 
 
 async def test_enqueue_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None:
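A note on why the re-check in `to_absolute_url_iterator` catches cases like `mailto:`: such a reference has a scheme but no network location, so RFC 3986 joining returns it unchanged, and it still fails the absoluteness check afterwards. A minimal standalone sketch of that behavior (the `urlparse`-based `is_url_absolute` below is an assumption for illustration, not necessarily the exact implementation in `urls.py`):

```python
from urllib.parse import urlparse

from yarl import URL


def is_url_absolute(url: str) -> bool:
    # Assumed check: "absolute" means the URL has both a scheme and a host.
    parsed = urlparse(url)
    return bool(parsed.scheme) and bool(parsed.netloc)


base = 'https://example.com/index'

# A path-only reference joins onto the base and becomes absolute.
assert str(URL(base).join(URL('/page_1'))) == 'https://example.com/page_1'

# A 'mailto:' reference already has a scheme, so join() returns it as-is,
# and it still has no host -- exactly the case the new guard skips.
joined = str(URL(base).join(URL('mailto:test@test.com')))
assert joined == 'mailto:test@test.com'
assert not is_url_absolute(joined)
```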
diff --git a/tests/unit/crawlers/_parsel/test_parsel_crawler.py b/tests/unit/crawlers/_parsel/test_parsel_crawler.py
index e733296189..d6eed81083 100644
--- a/tests/unit/crawlers/_parsel/test_parsel_crawler.py
+++ b/tests/unit/crawlers/_parsel/test_parsel_crawler.py
@@ -32,7 +32,7 @@ async def request_handler(context: ParselCrawlingContext) -> None:
     assert handler.called
 
-    # The handler should find two links
-    assert len(handler.call_args[0][0]) == 2
+    # The handler should find three links
+    assert len(handler.call_args[0][0]) == 3
 
 
 async def test_enqueue_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None:
diff --git a/tests/unit/server_endpoints.py b/tests/unit/server_endpoints.py
index a9f48e6e47..6070e046d0 100644
--- a/tests/unit/server_endpoints.py
+++ b/tests/unit/server_endpoints.py
@@ -14,6 +14,7 @@
 
     Link 1
     Link 2
+    <a href="mailto:test@test.com">test@test.com</a>
 """
 
 SECONDARY_INDEX = b"""\
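For completeness, a quick end-to-end run of the updated helper, wired up the same way the two crawlers now pass `context.log` (a sketch that assumes the diff above is applied; the logger name is arbitrary):

```python
import logging

from crawlee._utils.urls import to_absolute_url_iterator

logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger('extract_links')

links = iter(['/page_1', 'https://example.com/page_2', 'mailto:test@test.com'])
resolved = list(to_absolute_url_iterator('https://example.com/', links, logger=log))

# The mailto: entry is dropped with a debug message instead of being enqueued
# as a bogus request; everything else comes back absolute.
assert resolved == ['https://example.com/page_1', 'https://example.com/page_2']
```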