feat: Add ErrorSnapshotter to ErrorTracker #1125

Merged 18 commits on Apr 7, 2025
27 changes: 27 additions & 0 deletions docs/examples/capturing_page_snapshots_with_error_snapshotter.mdx
@@ -0,0 +1,27 @@
---
id: capturing-page-snapshots-with-error-snapshotter
title: Capturing page snapshots with ErrorSnapshotter
description: How to capture page snapshots on errors.
---
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
import ApiLink from '@site/src/components/ApiLink';
import ParselCrawlerWithErrorSnapshotter from '!!raw-loader!roa-loader!./code_examples/parsel_crawler_with_error_snapshotter.py';
import PlaywrightCrawlerWithErrorSnapshotter from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_with_error_snapshotter.py';


This example demonstrates how to capture page snapshots on the first occurrence of each unique error. The capturing happens automatically if you set `save_error_snapshots=True` in the crawler's <ApiLink to="class/Statistics">`Statistics`</ApiLink>. The error snapshot can contain an `html` file and a `jpeg` file created from the page where the unhandled exception was raised. Captured error snapshot files are saved to the default key-value store. Both <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> and [HTTP crawlers](../guides/http-crawlers) can capture the `html` file, but only <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> can also capture a page screenshot.

<Tabs>
    <TabItem value="ParselCrawler" label="ParselCrawler">
        <RunnableCodeBlock className="language-python" language="python">
            { ParselCrawlerWithErrorSnapshotter }
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="PlaywrightCrawler" label="PlaywrightCrawler">
        <RunnableCodeBlock className="language-python" language="python">
            { PlaywrightCrawlerWithErrorSnapshotter }
        </RunnableCodeBlock>
    </TabItem>
</Tabs>
31 changes: 31 additions & 0 deletions docs/examples/code_examples/parsel_crawler_with_error_snapshotter.py
@@ -0,0 +1,31 @@
import asyncio
from random import choice

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.statistics import Statistics


async def main() -> None:
    crawler = ParselCrawler(
        statistics=Statistics.with_default_state(save_error_snapshots=True)
    )

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        # Simulate various errors to demonstrate that `ErrorSnapshotter`
        # saves only the first occurrence of each unique error.
        await context.enqueue_links()
        random_number = choice(range(10))
        if random_number == 1:
            raise KeyError('Some KeyError')
        if random_number == 2:
            raise ValueError('Some ValueError')
        if random_number == 3:
            raise RuntimeError('Some RuntimeError')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
31 changes: 31 additions & 0 deletions docs/examples/code_examples/playwright_crawler_with_error_snapshotter.py
@@ -0,0 +1,31 @@
import asyncio
from random import choice

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.statistics import Statistics


async def main() -> None:
    crawler = PlaywrightCrawler(
        statistics=Statistics.with_default_state(save_error_snapshots=True)
    )

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        # Simulate various errors to demonstrate that `ErrorSnapshotter`
        # saves only the first occurrence of each unique error.
        await context.enqueue_links()
        random_number = choice(range(10))
        if random_number == 1:
            raise KeyError('Some KeyError')
        if random_number == 2:
            raise ValueError('Some ValueError')
        if random_number == 3:
            raise RuntimeError('Some RuntimeError')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
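After a crawl that produced errors, the captured snapshot files can be inspected in the default key-value store. Below is a minimal sketch of reading them back; the exact key names used for snapshots are not shown in this PR excerpt, so the `'ERROR'` filter is only an illustrative assumption.

```python
import asyncio

from crawlee.storages import KeyValueStore


async def main() -> None:
    # Error snapshots are saved to the default key-value store.
    kvs = await KeyValueStore.open()

    # List stored keys and pick out the ones that look like error snapshots.
    # The 'ERROR' substring check is an assumption for illustration only;
    # inspect your store to see the actual key names.
    async for key_info in kvs.iterate_keys():
        if 'ERROR' in key_info.key.upper():
            print(key_info.key)


if __name__ == '__main__':
    asyncio.run(main())
```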
4 changes: 2 additions & 2 deletions docs/examples/crawl_specific_links_on_website.mdx
@@ -35,12 +35,12 @@ This example demonstrates how to crawl a website while targeting specific patter

<Tabs groupId="second-example">
<TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler">
<RunnableCodeBlock className="language-python">
<RunnableCodeBlock className="language-python" language="python">
{BeautifulSoupExampleExtractAndAdd}
</RunnableCodeBlock>
</TabItem>
<TabItem value="PlaywrightCrawler" label="PlaywrightCrawler">
<RunnableCodeBlock className="language-python">
<RunnableCodeBlock className="language-python" language="python">
{PlaywrightExampleExtractAndAdd}
</RunnableCodeBlock>
</TabItem>
5 changes: 5 additions & 0 deletions pyproject.toml
@@ -179,6 +179,11 @@ indent-style = "space"
    "F841", # Local variable {variable} is assigned to but never used
    "N999", # Invalid module name
]
"**/docs/examples/code_examples/*crawler_with_error_snapshotter.py" = [
    "PLR2004", # Magic value used in comparison. Ignored for simplicity and readability of example code.
]



[tool.ruff.lint.flake8-quotes]
docstring-quotes = "double"
32 changes: 32 additions & 0 deletions src/crawlee/_types.py
@@ -1,5 +1,6 @@
from __future__ import annotations

import dataclasses
from collections.abc import Iterator, Mapping
from dataclasses import dataclass
from enum import Enum
@@ -559,6 +560,33 @@ def __call__(
        """


@docs_group('Data structures')
@dataclasses.dataclass
class PageSnapshot:
    """Snapshot of a crawled page."""

    screenshot: bytes | None = None
    """Screenshot of the page."""

    html: str | None = None
    """HTML content of the page."""

    def __bool__(self) -> bool:
        return bool(self.screenshot or self.html)


@docs_group('Functions')
class GetPageSnapshot(Protocol):
    """A function for getting a snapshot of a page."""

    def __call__(self) -> Coroutine[None, None, PageSnapshot]:
        """Get page snapshot.

        Returns:
            Snapshot of a page.
        """


@docs_group('Functions')
class UseStateFunction(Protocol):
    """A function for managing state within the crawling context.
@@ -619,6 +647,10 @@ class BasicCrawlingContext:
    log: logging.Logger
    """Logger instance."""

    async def get_snapshot(self) -> PageSnapshot:
        """Get snapshot of crawled page."""
        return PageSnapshot()

    def __hash__(self) -> int:
        """Return hash of the context. Each context is considered unique."""
        return id(self)
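A small illustration of what the `__bool__` override buys callers: an empty snapshot, such as the default one returned by `BasicCrawlingContext.get_snapshot`, is falsy, so code can cheaply skip persisting it. The import path below is the private module shown in this diff.

```python
from crawlee._types import PageSnapshot

# The base BasicCrawlingContext returns an empty snapshot, which is falsy.
assert not PageSnapshot()

# Any populated field makes the snapshot truthy, so callers can guard persistence with `if snapshot:`.
assert PageSnapshot(html='<html><body>Hello</body></html>')
assert PageSnapshot(screenshot=b'\xff\xd8\xff')  # placeholder bytes standing in for a real JPEG
```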
@@ -5,7 +5,7 @@

from typing_extensions import Self, TypeVar

from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction, ExtractLinksFunction
from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction, ExtractLinksFunction, PageSnapshot
from crawlee._utils.docs import docs_group
from crawlee.http_clients import HttpCrawlingResult, HttpResponse

@@ -24,6 +24,10 @@ def from_basic_crawling_context(cls, context: BasicCrawlingContext, http_respons
        context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)}
        return cls(http_response=http_response, **context_kwargs)

    async def get_snapshot(self) -> PageSnapshot:
        """Get snapshot of crawled page."""
        return PageSnapshot(html=self.http_response.read().decode('utf-8'))


@dataclass(frozen=True)
@docs_group('Data structures')
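Since HTTP-based contexts build the snapshot from the response body that was already fetched, calling it from a request handler costs no extra network round trip. A usage sketch, assuming `ParselCrawlingContext` inherits the new `get_snapshot` from the HTTP crawling context above:

```python
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


async def main() -> None:
    crawler = ParselCrawler()

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        # The snapshot contains the decoded response body; HTTP crawlers produce no screenshot.
        snapshot = await context.get_snapshot()
        if snapshot.html:
            context.log.info(f'Captured {len(snapshot.html)} characters of HTML from {context.request.url}')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```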
8 changes: 4 additions & 4 deletions src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -892,7 +892,7 @@ async def _handle_request_retries(

        if self._should_retry_request(context, error):
            request.retry_count += 1
            self._statistics.error_tracker.add(error)
            await self._statistics.error_tracker.add(error=error, context=context)

            if self._error_handler:
                try:
@@ -946,7 +946,7 @@ async def _handle_request_error(self, context: TCrawlingContext | BasicCrawlingC

    async def _handle_failed_request(self, context: TCrawlingContext | BasicCrawlingContext, error: Exception) -> None:
        self._logger.exception('Request failed and reached maximum retries', exc_info=error)
        self._statistics.error_tracker.add(error)
        await self._statistics.error_tracker.add(error=error, context=context)

        if self._failed_request_handler:
            try:
@@ -1162,7 +1162,7 @@ async def __run_task_function(self) -> None:
                context.request.session_rotation_count += 1

                await request_manager.reclaim_request(request)
                self._statistics.error_tracker_retry.add(session_error)
                await self._statistics.error_tracker_retry.add(error=session_error, context=context)
            else:
                self._logger.exception('Request failed and reached maximum retries', exc_info=session_error)

@@ -1176,7 +1176,7 @@
                )

                self._statistics.record_request_processing_failure(statistics_id)
                self._statistics.error_tracker.add(session_error)
                await self._statistics.error_tracker.add(error=session_error, context=context)

        except ContextPipelineInterruptedError as interrupted_error:
            self._logger.debug('The context pipeline was interrupted', exc_info=interrupted_error)
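The `ErrorTracker.add` call is now asynchronous and receives the crawling context, which is what lets the new `ErrorSnapshotter` ask the context for a page snapshot. The snapshotter itself is outside this excerpt; the sketch below is only a simplified illustration of the idea (class name and internals are not the PR's actual implementation): only the first occurrence of each unique error triggers a snapshot.

```python
from __future__ import annotations

from crawlee._types import BasicCrawlingContext


class SimplifiedErrorTracker:
    """Illustrative stand-in for the real ErrorTracker + ErrorSnapshotter pair."""

    def __init__(self) -> None:
        self._seen_error_groups: set[str] = set()

    async def add(self, *, error: Exception, context: BasicCrawlingContext, early: bool = False) -> None:
        # Group errors by type and message; only the first occurrence gets a snapshot.
        error_group = f'{type(error).__name__}: {error}'
        if error_group in self._seen_error_groups:
            return
        self._seen_error_groups.add(error_group)

        # `early=True` is passed by crawlers that must capture the page before it is
        # torn down (see the Playwright crawler below). The snapshot comes from the context.
        snapshot = await context.get_snapshot()
        if snapshot:
            ...  # persist snapshot.html / snapshot.screenshot to the default key-value store
```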
20 changes: 13 additions & 7 deletions src/crawlee/crawlers/_basic/_context_pipeline.py
@@ -1,7 +1,6 @@
from __future__ import annotations

from collections.abc import AsyncGenerator, Awaitable, Generator
from typing import Any, Callable, Generic, cast
from typing import TYPE_CHECKING, Any, Callable, Generic, cast

from typing_extensions import TypeVar

@@ -15,6 +14,9 @@
    SessionError,
)

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator, Awaitable, Generator

TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
TMiddlewareCrawlingContext = TypeVar('TMiddlewareCrawlingContext', bound=BasicCrawlingContext)

@@ -31,7 +33,7 @@ def __init__(
        *,
        _middleware: Callable[
            [TCrawlingContext],
            AsyncGenerator[TMiddlewareCrawlingContext, None],
            AsyncGenerator[TMiddlewareCrawlingContext, Exception | None],
        ]
        | None = None,
        _parent: ContextPipeline[BasicCrawlingContext] | None = None,
@@ -55,7 +57,8 @@ async def __call__(
        Exceptions from the consumer function are wrapped together with the final crawling context.
        """
        chain = list(self._middleware_chain())
        cleanup_stack = list[AsyncGenerator]()
        cleanup_stack: list[AsyncGenerator[Any, Exception | None]] = []
        final_consumer_exception: Exception | None = None

        try:
            for member in reversed(chain):
@@ -77,14 +80,16 @@

            try:
                await final_context_consumer(cast('TCrawlingContext', crawling_context))
            except SessionError:  # Session errors get special treatment
            except SessionError as e:  # Session errors get special treatment
                final_consumer_exception = e
                raise
            except Exception as e:
                final_consumer_exception = e
                raise RequestHandlerError(e, crawling_context) from e
        finally:
            for middleware_instance in reversed(cleanup_stack):
                try:
                    result = await middleware_instance.__anext__()
                    result = await middleware_instance.asend(final_consumer_exception)
                except StopAsyncIteration:  # noqa: PERF203
                    pass
                except ContextPipelineInterruptedError as e:
@@ -111,7 +116,8 @@ def compose(
        """
        return ContextPipeline[TMiddlewareCrawlingContext](
            _middleware=cast(
                'Callable[[BasicCrawlingContext], AsyncGenerator[TMiddlewareCrawlingContext, None]]', middleware
                'Callable[[BasicCrawlingContext], AsyncGenerator[TMiddlewareCrawlingContext, Exception | None]]',
                middleware,
            ),
            _parent=cast('ContextPipeline[BasicCrawlingContext]', self),
        )
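The switch from `__anext__()` to `asend(final_consumer_exception)` means each middleware generator now receives the consumer's exception (or `None`) at its `yield` point. A hedged sketch of a middleware written against this new contract:

```python
from __future__ import annotations

from collections.abc import AsyncGenerator

from crawlee._types import BasicCrawlingContext


async def example_middleware(
    context: BasicCrawlingContext,
) -> AsyncGenerator[BasicCrawlingContext, Exception | None]:
    # Set up resources needed by downstream middleware and the request handler here.
    error = yield context  # None on success, otherwise the consumer's exception.
    if error is not None:
        # The failure is observable while resources are still alive,
        # e.g. a page snapshot can be captured before the page is closed.
        ...
    # Tear down resources here.
```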
10 changes: 7 additions & 3 deletions src/crawlee/crawlers/_playwright/_playwright_crawler.py
@@ -202,7 +202,7 @@ async def _open_page(
    async def _navigate(
        self,
        context: PlaywrightPreNavCrawlingContext,
    ) -> AsyncGenerator[PlaywrightCrawlingContext, None]:
    ) -> AsyncGenerator[PlaywrightCrawlingContext, Exception | None]:
        """Execute an HTTP request utilizing the `BrowserPool` and the `Playwright` library.

        Args:
@@ -238,7 +238,7 @@ async def _navigate(

        extract_links = self._create_extract_links_function(context)

        yield PlaywrightCrawlingContext(
        error = yield PlaywrightCrawlingContext(
            request=context.request,
            session=context.session,
            add_requests=context.add_requests,
@@ -251,11 +251,15 @@
            page=context.page,
            infinite_scroll=lambda: infinite_scroll(context.page),
            response=response,
            enqueue_links=self._create_enqueue_links_function(context, extract_links),
            extract_links=extract_links,
            enqueue_links=self._create_enqueue_links_function(context, extract_links),
            block_requests=partial(block_requests, page=context.page),
        )

        # Collect data in case of errors, before the page object is closed.
        if error:
            await self.statistics.error_tracker.add(error=error, context=context, early=True)

    def _create_extract_links_function(self, context: PlaywrightPreNavCrawlingContext) -> ExtractLinksFunction:
        """Create a callback function for extracting links from context.

@@ -3,7 +3,7 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING

from crawlee._types import BasicCrawlingContext
from crawlee._types import BasicCrawlingContext, PageSnapshot
from crawlee._utils.docs import docs_group

if TYPE_CHECKING:
@@ -25,3 +25,20 @@ class PlaywrightPreNavCrawlingContext(BasicCrawlingContext):

    block_requests: BlockRequestsFunction
    """Blocks network requests matching specified URL patterns."""

    async def get_snapshot(self) -> PageSnapshot:
        """Get snapshot of crawled page."""
        html = None
        screenshot = None

        try:
            html = await self.page.content()
        except Exception:
            self.log.exception(f'Failed to get html snapshot for {self.request.url}.')

        try:
            screenshot = await self.page.screenshot(full_page=True, type='jpeg')
        except Exception:
            self.log.exception(f'Failed to get page screenshot for {self.request.url}.')

        return PageSnapshot(html=html, screenshot=screenshot)
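Beyond the automatic capture on errors, the same method can be called on demand from a request handler. A hedged sketch, assuming `PlaywrightCrawlingContext` inherits `get_snapshot` from this pre-navigation context; the key name below is arbitrary:

```python
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.storages import KeyValueStore


async def main() -> None:
    crawler = PlaywrightCrawler()

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        # Capture the live page: HTML content plus a full-page JPEG screenshot.
        snapshot = await context.get_snapshot()
        if snapshot.screenshot:
            kvs = await KeyValueStore.open()
            await kvs.set_value('manual-snapshot.jpeg', snapshot.screenshot, content_type='image/jpeg')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```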