src/crawlee/_utils/urls.py (9 additions, 2 deletions)

@@ -7,6 +7,7 @@
 
 if TYPE_CHECKING:
     from collections.abc import Iterator
+    from logging import Logger
 
 
 def is_url_absolute(url: str) -> bool:

@@ -22,13 +23,19 @@ def convert_to_absolute_url(base_url: str, relative_url: str) -> str:
     return str(URL(base_url).join(URL(relative_url)))
 
 
-def to_absolute_url_iterator(base_url: str, urls: Iterator[str]) -> Iterator[str]:
+def to_absolute_url_iterator(base_url: str, urls: Iterator[str], logger: Logger | None = None) -> Iterator[str]:
     """Convert an iterator of relative URLs to absolute URLs using a base URL."""
     for url in urls:
         if is_url_absolute(url):
             yield url
         else:
-            yield convert_to_absolute_url(base_url, url)
+            converted_url = convert_to_absolute_url(base_url, url)
+            # Skip the URL if conversion fails, probably due to an incorrect format, such as 'mailto:'.
+            if not is_url_absolute(converted_url):
+                if logger:
+                    logger.debug(f'Could not convert URL "{url}" to absolute using base URL "{base_url}". Skipping it.')
+                continue
+            yield converted_url
 
 
 _http_url_adapter = TypeAdapter(AnyHttpUrl)
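
A minimal usage sketch (not part of the PR) of the updated helper, assuming is_url_absolute() still reports a mailto: URI as non-absolute even after joining; the base URL and logger name are illustrative:

import logging

from crawlee._utils.urls import to_absolute_url_iterator

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger('url_demo')  # illustrative logger name

links = iter(['/sub_index', 'mailto:[email protected]', 'https://example.com/page_1'])
absolute = list(to_absolute_url_iterator('https://example.com', links, logger=logger))

# Under the assumption above, the mailto: entry is dropped with a debug message,
# the relative path is joined against the base URL, and the already-absolute URL
# passes through unchanged.
print(absolute)  # ['https://example.com/sub_index', 'https://example.com/page_1']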

(file path not shown in this view)

@@ -167,7 +167,9 @@ async def extract_links(
         kwargs.setdefault('strategy', 'same-hostname')
 
         links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-        links_iterator = to_absolute_url_iterator(context.request.loaded_url or context.request.url, links_iterator)
+        links_iterator = to_absolute_url_iterator(
+            context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+        )
 
         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
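
The unchanged context above splits the extracted links by robots.txt permission before enqueueing. A small illustrative sketch of that split, assuming the partition helper used here follows the more_itertools.partition convention (items failing the predicate come first, items passing it second); is_allowed is a stand-in for robots_txt_file.is_allowed:

from more_itertools import partition

def is_allowed(url: str) -> bool:
    # Stand-in predicate for robots_txt_file.is_allowed.
    return not url.endswith('/private')

links = iter(['https://example.com/', 'https://example.com/private'])
skipped, allowed = partition(is_allowed, links)

print(list(skipped))  # ['https://example.com/private']  (disallowed by robots.txt)
print(list(allowed))  # ['https://example.com/']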

src/crawlee/crawlers/_playwright/_playwright_crawler.py (3 additions, 1 deletion)

@@ -366,7 +366,9 @@ async def extract_links(
         links_iterator: Iterator[str] = iter(
             [url for element in elements if (url := await element.get_attribute('href')) is not None]
         )
-        links_iterator = to_absolute_url_iterator(context.request.loaded_url or context.request.url, links_iterator)
+        links_iterator = to_absolute_url_iterator(
+            context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+        )
 
         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)

(file path not shown in this view)

@@ -30,7 +30,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
     assert handler.called
 
     # The handler should find two links
-    assert len(handler.call_args[0][0]) == 2
+    assert len(handler.call_args[0][0]) == 3
 
 
 async def test_enqueue_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None:

tests/unit/crawlers/_parsel/test_parsel_crawler.py (1 addition, 1 deletion)

@@ -32,7 +32,7 @@ async def request_handler(context: ParselCrawlingContext) -> None:
     assert handler.called
 
     # The handler should find two links
-    assert len(handler.call_args[0][0]) == 2
+    assert len(handler.call_args[0][0]) == 3
 
 
 async def test_enqueue_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None:

tests/unit/server_endpoints.py (1 addition, 0 deletions)

@@ -14,6 +14,7 @@
 <body>
 <a href="/sub_index" class="foo">Link 1</a>
 <a href="/page_1">Link 2</a>
+<a href="mailto:[email protected]">[email protected]</a>
 </body></html>"""
 
 SECONDARY_INDEX = b"""\
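
The extra mailto: anchor in the INDEX fixture is why the expected link count in the BeautifulSoup and Parsel tests above moves from 2 to 3. A quick illustrative check of the anchor count (only the <body> shown in the hunk is used; the rest of the fixture is trimmed):

from bs4 import BeautifulSoup

BODY = b"""<body>
<a href="/sub_index" class="foo">Link 1</a>
<a href="/page_1">Link 2</a>
<a href="mailto:[email protected]">[email protected]</a>
</body>"""

print(len(BeautifulSoup(BODY, 'html.parser').find_all('a')))  # 3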