3 changes: 2 additions & 1 deletion src/crawlee/crawlers/__init__.py
@@ -1,7 +1,7 @@
from crawlee._utils.try_import import install_import_hook as _install_import_hook
from crawlee._utils.try_import import try_import as _try_import

from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, ParsedHttpCrawlingContext
from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, HttpCrawlerOptions, ParsedHttpCrawlingContext
from ._basic import BasicCrawler, BasicCrawlerOptions, BasicCrawlingContext, ContextPipeline
from ._http import HttpCrawler, HttpCrawlingContext, HttpCrawlingResult

@@ -51,6 +51,7 @@
'BeautifulSoupParserType',
'ContextPipeline',
'HttpCrawler',
'HttpCrawlerOptions',
'HttpCrawlingContext',
'HttpCrawlingResult',
'ParsedHttpCrawlingContext',
3 changes: 2 additions & 1 deletion src/crawlee/crawlers/_abstract_http/__init__.py
@@ -1,9 +1,10 @@
from ._abstract_http_crawler import AbstractHttpCrawler
from ._abstract_http_crawler import AbstractHttpCrawler, HttpCrawlerOptions
from ._abstract_http_parser import AbstractHttpParser
from ._http_crawling_context import ParsedHttpCrawlingContext

__all__ = [
'AbstractHttpCrawler',
'AbstractHttpParser',
'HttpCrawlerOptions',
'ParsedHttpCrawlingContext',
]
19 changes: 18 additions & 1 deletion src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
@@ -3,11 +3,12 @@
import asyncio
import logging
from abc import ABC
from datetime import timedelta
from typing import TYPE_CHECKING, Any, Generic

from more_itertools import partition
from pydantic import ValidationError
from typing_extensions import TypeVar
from typing_extensions import NotRequired, TypeVar

from crawlee._request import Request, RequestOptions
from crawlee._utils.docs import docs_group
@@ -32,6 +33,19 @@
TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)


class HttpCrawlerOptions(
BasicCrawlerOptions[TCrawlingContext, TStatisticsState],
Generic[TCrawlingContext, TStatisticsState],
):
"""Arguments for the `AbstractHttpCrawler` constructor.

It is intended for typing forwarded `__init__` arguments in the subclasses.
"""

navigation_timeout: NotRequired[timedelta | None]
"""Timeout for the HTTP request."""


@docs_group('Crawlers')
class AbstractHttpCrawler(
BasicCrawler[TCrawlingContext, StatisticsState],
@@ -56,9 +70,11 @@ def __init__(
self,
*,
parser: AbstractHttpParser[TParseResult, TSelectResult],
navigation_timeout: timedelta | None = None,
**kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
) -> None:
self._parser = parser
self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []

if '_context_pipeline' not in kwargs:
@@ -221,6 +237,7 @@ async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenera
session=context.session,
proxy_info=context.proxy_info,
statistics=self._statistics,
timeout=self._navigation_timeout,
)

yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)
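As an aside, here is a minimal sketch of how a concrete crawler is expected to use the new HttpCrawlerOptions to type its forwarded constructor kwargs. The MyHttpCrawler class and its parser argument are hypothetical and not part of this PR; the pattern simply mirrors the ParselCrawler and BeautifulSoupCrawler changes further down in this diff.

```python
from __future__ import annotations

from typing_extensions import Unpack

from crawlee.crawlers import (
    AbstractHttpCrawler,
    AbstractHttpParser,
    HttpCrawlerOptions,
    ParsedHttpCrawlingContext,
)


class MyHttpCrawler(AbstractHttpCrawler):
    """Hypothetical crawler illustrating typed kwarg forwarding."""

    def __init__(
        self,
        *,
        parser: AbstractHttpParser,
        **kwargs: Unpack[HttpCrawlerOptions[ParsedHttpCrawlingContext]],
    ) -> None:
        # `navigation_timeout` (plus every `BasicCrawlerOptions` field) arrives here as a
        # typed keyword argument; `AbstractHttpCrawler` defaults it to one minute and
        # forwards it to the HTTP client as the request timeout.
        super().__init__(parser=parser, **kwargs)
```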
15 changes: 9 additions & 6 deletions src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -1507,12 +1507,15 @@ async def __run_task_function(self) -> None:
raise

async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
await wait_for(
lambda: self._context_pipeline(context, self.router),
timeout=self._request_handler_timeout,
timeout_message=f'{self._request_handler_timeout_text}'
f' {self._request_handler_timeout.total_seconds()} seconds',
logger=self._logger,
await self._context_pipeline(
context,
lambda final_context: wait_for(
lambda: self.router(final_context),
timeout=self._request_handler_timeout,
timeout_message=f'{self._request_handler_timeout_text}'
f' {self._request_handler_timeout.total_seconds()} seconds',
logger=self._logger,
),
)

@Pijukatel (Collaborator), Nov 28, 2025:
I am just wondering: can the context pipeline now get stuck?
navigation_timeout (some parts of the context pipeline) -> time-unlimited parts of the context pipeline -> request_handler_timeout (request handler)

@janbuchar (Collaborator, Author), Nov 28, 2025:
Realistically, that has always been the case, but yeah, this increases the odds by a bit. The hooks (discussed in #1474 (comment)) are probably the biggest risk.
Perhaps we could add some comically large timeout to the context pipeline execution as a whole, too.

Collaborator:
In that case, it makes even more sense to include the hooks in the timeout.
Other steps do not appear to be a problem in our pipelines for now.

def _raise_for_error_status_code(self, status_code: int) -> None:
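For readers skimming the diff, a schematic restatement of this change using plain asyncio instead of crawlee's internal wait_for helper (the function names below are illustrative, not the library's): the request handler timeout now starts only once the pipeline reaches the user's handler, so navigation, parsing, and hooks no longer eat into it.

```python
import asyncio
from datetime import timedelta

REQUEST_HANDLER_TIMEOUT = timedelta(minutes=1)


async def context_pipeline(context, final_consumer):
    # Stand-in for the real pipeline: navigation, parsing, hooks, etc. run here,
    # outside the request handler timeout.
    await final_consumer(context)


async def run_request_handler(context, router):
    # Previously the whole pipeline ran under the timeout; now only the router call does.
    await context_pipeline(
        context,
        lambda final_context: asyncio.wait_for(
            router(final_context),
            timeout=REQUEST_HANDLER_TIMEOUT.total_seconds(),
        ),
    )
```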
4 changes: 2 additions & 2 deletions src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py
@@ -5,7 +5,7 @@
from bs4 import BeautifulSoup, Tag

from crawlee._utils.docs import docs_group
from crawlee.crawlers import AbstractHttpCrawler, BasicCrawlerOptions
from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions

from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
from ._beautifulsoup_parser import BeautifulSoupParser, BeautifulSoupParserType
@@ -58,7 +58,7 @@ def __init__(
self,
*,
parser: BeautifulSoupParserType = 'lxml',
**kwargs: Unpack[BasicCrawlerOptions[BeautifulSoupCrawlingContext]],
**kwargs: Unpack[HttpCrawlerOptions[BeautifulSoupCrawlingContext]],
) -> None:
"""Initialize a new instance.

4 changes: 2 additions & 2 deletions src/crawlee/crawlers/_parsel/_parsel_crawler.py
@@ -5,7 +5,7 @@
from parsel import Selector

from crawlee._utils.docs import docs_group
from crawlee.crawlers import AbstractHttpCrawler, BasicCrawlerOptions
from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions

from ._parsel_crawling_context import ParselCrawlingContext
from ._parsel_parser import ParselParser
@@ -56,7 +56,7 @@ async def request_handler(context: ParselCrawlingContext) -> None:

def __init__(
self,
**kwargs: Unpack[BasicCrawlerOptions[ParselCrawlingContext]],
**kwargs: Unpack[HttpCrawlerOptions[ParselCrawlingContext]],
) -> None:
"""Initialize a new instance.

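Since ParselCrawler (and BeautifulSoupCrawler above) now types its kwargs with HttpCrawlerOptions, navigation_timeout can be passed straight through the constructor. A short usage sketch; the URL and timeout values are arbitrary, and request_handler_timeout is the existing BasicCrawler option referenced in the _basic_crawler.py change above.

```python
import asyncio
from datetime import timedelta

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


async def main() -> None:
    crawler = ParselCrawler(
        # Caps the HTTP request performed during navigation; defaults to one minute if omitted.
        navigation_timeout=timedelta(seconds=30),
        # Limits only the user request handler, per the _basic_crawler.py change above.
        request_handler_timeout=timedelta(minutes=2),
    )

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        context.log.info(f'Processed {context.request.url}')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```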
15 changes: 14 additions & 1 deletion src/crawlee/crawlers/_playwright/_playwright_crawler.py
@@ -3,9 +3,11 @@
import asyncio
import logging
import warnings
from datetime import timedelta
from functools import partial
from typing import TYPE_CHECKING, Any, Generic, Literal

import playwright.async_api
from more_itertools import partition
from pydantic import ValidationError
from typing_extensions import NotRequired, TypedDict, TypeVar
@@ -106,6 +108,7 @@ def __init__(
fingerprint_generator: FingerprintGenerator | None | Literal['default'] = 'default',
headless: bool | None = None,
use_incognito_pages: bool | None = None,
navigation_timeout: timedelta | None = None,
**kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext, StatisticsState]],
) -> None:
"""Initialize a new instance.
@@ -134,6 +137,8 @@ def __init__(
use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
own context that is destroyed once the page is closed or crashes.
This option should not be used if `browser_pool` is provided.
navigation_timeout: Timeout for navigation (the process between opening a Playwright page and calling
the request handler)
kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
"""

@janbuchar (Collaborator, Author):
Open question: should the navigation_timeout also apply to pre-navigation hooks? Should they have their own limit? Should they share a limit with the request handler?
@B4nan your opinion on this is also welcome. This is due to change in crawlee js v4, so we should be aligned.

@B4nan (Member):
I would include the hooks, otherwise we would need another two options for timeouts of pre- and post-navigation hooks. They need to be part of some timeout handler, and having three options just for the navigation feels like too much.

@janbuchar (Collaborator, Author):
@B4nan include them in navigation_timeout, you mean?

@B4nan (Member):
Yes, either that, or separate timeouts over pre- and post-navigation hooks. User code needs to be wrapped in timeout handlers.

Collaborator:
I would say we can start with one shared timeout for all, and adjust later if needed.

@janbuchar (Collaborator, Author):
Well, that's how we got here 😁 but yeah, including the hooks in the navigation timeout is fine with me.

configuration = kwargs.pop('configuration', None)
@@ -202,6 +207,8 @@ def __init__(
if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)

self._navigation_timeout = navigation_timeout or timedelta(minutes=1)

super().__init__(**kwargs)

async def _open_page(
@@ -266,6 +273,7 @@ async def _navigate(
Raises:
ValueError: If the browser pool is not initialized.
SessionError: If the URL cannot be loaded by the browser.
TimeoutError: If navigation does not succeed within the navigation timeout.
Yields:
The enhanced crawling context with the Playwright-specific features (page, response, enqueue_links,
@@ -297,7 +305,12 @@ async def _navigate(
# Set route_handler only for current request
await context.page.route(context.request.url, route_handler)

response = await context.page.goto(context.request.url)
try:
response = await context.page.goto(
context.request.url, timeout=self._navigation_timeout.total_seconds() * 1000
)
except playwright.async_api.TimeoutError as exc:
raise asyncio.TimeoutError from exc

if response is None:
raise SessionError(f'Failed to load the URL: {context.request.url}')
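On the Playwright side the option reads the same from the user's perspective. A brief hedged example with an arbitrary value; per the diff above, a navigation that exceeds the limit now surfaces as asyncio.TimeoutError rather than Playwright's own TimeoutError, and pre-navigation hooks are not yet covered by it in this diff.

```python
import asyncio
from datetime import timedelta

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    crawler = PlaywrightCrawler(
        # Applied to page.goto(); converted to milliseconds internally, as shown above.
        navigation_timeout=timedelta(seconds=45),
    )

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Loaded {context.request.url}')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```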
8 changes: 7 additions & 1 deletion src/crawlee/crawlers/_playwright/_playwright_http_client.py
@@ -59,6 +59,7 @@ async def crawl(
session: Session | None = None,
proxy_info: ProxyInfo | None = None,
statistics: Statistics | None = None,
timeout: timedelta | None = None,
) -> HttpCrawlingResult:
raise NotImplementedError('The `crawl` method should not be used for `PlaywrightHttpClient`')

@@ -72,6 +73,7 @@ async def send_request(
payload: HttpPayload | None = None,
session: Session | None = None,
proxy_info: ProxyInfo | None = None,
timeout: timedelta | None = None,
) -> HttpResponse:
# `proxy_info` are not used because `APIRequestContext` inherits the proxy from `BrowserContext`
# TODO: Use `session` to restore all the fingerprint headers according to the `BrowserContext`, after resolved
@@ -87,7 +89,11 @@

# Proxies appropriate to the browser context are used
response = await browser_context.request.fetch(
url_or_request=url, method=method.lower(), headers=dict(headers) if headers else None, data=payload
url_or_request=url,
method=method.lower(),
headers=dict(headers) if headers else None,
data=payload,
timeout=timeout.total_seconds() if timeout else None,
)

return await PlaywrightHttpResponse.from_playwright_response(response, protocol='')
4 changes: 4 additions & 0 deletions src/crawlee/http_clients/_base.py
@@ -104,6 +104,7 @@ async def crawl(
session: Session | None = None,
proxy_info: ProxyInfo | None = None,
statistics: Statistics | None = None,
timeout: timedelta | None = None,
) -> HttpCrawlingResult:
"""Perform the crawling for a given request.

@@ -114,6 +115,7 @@
session: The session associated with the request.
proxy_info: The information about the proxy to be used.
statistics: The statistics object to register status codes.
timeout: Request timeout
Collaborator:
Something more descriptive? Although I am not sure whether "process" is the correct word, I cannot find a better one.
Suggested change: "timeout: Request timeout" -> "timeout: Maximum time allowed to process the request."


Raises:
ProxyError: Raised if a proxy-related error occurs.
@@ -132,6 +134,7 @@ async def send_request(
payload: HttpPayload | None = None,
session: Session | None = None,
proxy_info: ProxyInfo | None = None,
timeout: timedelta | None = None,
) -> HttpResponse:
"""Send an HTTP request via the client.

@@ -144,6 +147,7 @@
payload: The data to be sent as the request body.
session: The session associated with the request.
proxy_info: The information about the proxy to be used.
timeout: Request timeout
Collaborator:
Something more descriptive? Although I am not sure whether "process" is the correct word, I cannot find a better one.
Suggested change: "timeout: Request timeout" -> "timeout: Maximum time allowed to process the request."


Raises:
ProxyError: Raised if a proxy-related error occurs.
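For anyone maintaining a custom HttpClient subclass: the new timeout parameter should be honored and timeout failures normalized to asyncio.TimeoutError, as the curl-impersonate client below does. A rough httpx-based sketch of just the timeout handling; this is a standalone helper for illustration, not crawlee's actual HttpxHttpClient.

```python
from __future__ import annotations

import asyncio
from datetime import timedelta

import httpx


async def fetch_with_timeout(
    client: httpx.AsyncClient,
    url: str,
    method: str = 'GET',
    timeout: timedelta | None = None,
) -> httpx.Response:
    # None means "fall back to the client's default timeout", mirroring the base signature.
    try:
        return await client.request(
            method,
            url,
            timeout=timeout.total_seconds() if timeout else httpx.USE_CLIENT_DEFAULT,
        )
    except httpx.TimeoutException as exc:
        # Normalize to asyncio.TimeoutError so crawler-level handling stays uniform.
        raise asyncio.TimeoutError from exc
```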
12 changes: 12 additions & 0 deletions src/crawlee/http_clients/_curl_impersonate.py
@@ -1,5 +1,6 @@
from __future__ import annotations

import asyncio
from contextlib import asynccontextmanager
from typing import TYPE_CHECKING, Any

@@ -10,6 +11,7 @@
from curl_cffi.requests.cookies import CurlMorsel
from curl_cffi.requests.exceptions import ProxyError as CurlProxyError
from curl_cffi.requests.exceptions import RequestException as CurlRequestError
from curl_cffi.requests.exceptions import Timeout
from curl_cffi.requests.impersonate import DEFAULT_CHROME as CURL_DEFAULT_CHROME
from typing_extensions import override

@@ -147,6 +149,7 @@ async def crawl(
session: Session | None = None,
proxy_info: ProxyInfo | None = None,
statistics: Statistics | None = None,
timeout: timedelta | None = None,
) -> HttpCrawlingResult:
client = self._get_client(proxy_info.url if proxy_info else None)

@@ -157,7 +160,10 @@
headers=request.headers,
data=request.payload,
cookies=session.cookies.jar if session else None,
timeout=timeout.total_seconds() if timeout else None,
)
except Timeout as exc:
raise asyncio.TimeoutError from exc
except CurlRequestError as exc:
if self._is_proxy_error(exc):
raise ProxyError from exc
@@ -186,6 +192,7 @@ async def send_request(
payload: HttpPayload | None = None,
session: Session | None = None,
proxy_info: ProxyInfo | None = None,
timeout: timedelta | None = None,
) -> HttpResponse:
if isinstance(headers, dict) or headers is None:
headers = HttpHeaders(headers or {})
@@ -200,7 +207,10 @@
headers=dict(headers) if headers else None,
data=payload,
cookies=session.cookies.jar if session else None,
timeout=timeout.total_seconds() if timeout else None,
)
except Timeout as exc:
raise asyncio.TimeoutError from exc
except CurlRequestError as exc:
if self._is_proxy_error(exc):
raise ProxyError from exc
@@ -241,6 +251,8 @@ async def stream(
stream=True,
timeout=timeout.total_seconds() if timeout else None,
)
except Timeout as exc:
raise asyncio.TimeoutError from exc
except CurlRequestError as exc:
if self._is_proxy_error(exc):
raise ProxyError from exc
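Finally, a small usage sketch of the new parameter on the client API directly; the deliberately tiny timeout just forces the error path and shows that curl_cffi's Timeout is re-raised as asyncio.TimeoutError.

```python
import asyncio
from datetime import timedelta

from crawlee.http_clients import CurlImpersonateHttpClient


async def main() -> None:
    client = CurlImpersonateHttpClient()
    try:
        response = await client.send_request(
            'https://crawlee.dev',
            timeout=timedelta(milliseconds=1),  # unrealistically small, to trigger the timeout
        )
    except asyncio.TimeoutError:
        print('Request timed out, as expected with such a small limit')
    else:
        print(f'Status: {response.status_code}')


if __name__ == '__main__':
    asyncio.run(main())
```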