
Commit 9666092

feat: Add ErrorSnapshotter to ErrorTracker (#1125)
### Description

- Added `ErrorSnapshotter`, which can take a page snapshot (screenshot or HTML) on the first occurrence of each unique error.
- Added documentation describing how to use it.

### Issues

- Closes: #151

### Testing

- Added unit tests.
- Example `PlaywrightCrawler`-based actor run with `ErrorSnapshotter`: https://console.apify.com/actors/C0lWh1UCQvgdArp6R/runs/UNuaiRWBDgxiJau0U#storage
1 parent 8ca6f0d commit 9666092

19 files changed: +498 / -78 lines
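For orientation before the per-file diffs: the feature is opt-in through the crawler's `Statistics`. A minimal sketch, mirroring the example files added below (illustrative only; the full runnable examples are part of this commit):

```python
import asyncio

from crawlee.crawlers import ParselCrawler
from crawlee.statistics import Statistics


async def main() -> None:
    # Enabling save_error_snapshots makes the crawler store an HTML snapshot
    # (and, for browser-based crawlers, a JPEG screenshot) of the page in the
    # default key-value store for the first occurrence of each unique error.
    crawler = ParselCrawler(
        statistics=Statistics.with_default_state(save_error_snapshots=True),
    )
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```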
Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+---
+id: capturing-page-snapshots-with-error-snapshotter
+title: Capturing page snapshots with ErrorSnapshotter
+description: How to capture page snapshots on errors.
+---
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
+import ApiLink from '@site/src/components/ApiLink';
+import ParselCrawlerWithErrorSnapshotter from '!!raw-loader!roa-loader!./code_examples/parsel_crawler_with_error_snapshotter.py';
+import PlaywrightCrawlerWithErrorSnapshotter from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_with_error_snapshotter.py';
+
+
+This example demonstrates how to capture page snapshots on the first occurrence of each unique error. The capturing happens automatically if you set `save_error_snapshots=True` in the crawler's <ApiLink to="class/Statistics">`Statistics`</ApiLink>. The error snapshot can contain an `html` file and a `jpeg` file that are created from the page where the unhandled exception was raised. Captured error snapshot files are saved to the default key-value store. Both <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> and [HTTP crawlers](../guides/http-crawlers) are capable of capturing the `html` file, but only <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> is able to capture a page screenshot as well.
+
+<Tabs>
+    <TabItem value="ParselCrawler" label="ParselCrawler">
+        <RunnableCodeBlock className="language-python" language="python">
+            { ParselCrawlerWithErrorSnapshotter }
+        </RunnableCodeBlock>
+    </TabItem>
+    <TabItem value="PlaywrightCrawler" label="PlaywrightCrawler">
+        <RunnableCodeBlock className="language-python" language="python">
+            { PlaywrightCrawlerWithErrorSnapshotter }
+        </RunnableCodeBlock>
+    </TabItem>
+</Tabs>
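The guide above mentions that captured snapshot files land in the default key-value store. A hedged sketch of inspecting that store after a run, assuming the existing `KeyValueStore.open()` and `iterate_keys()` APIs; the snapshot key names are generated by `ErrorSnapshotter`, so no particular naming scheme is assumed here:

```python
import asyncio

from crawlee.storages import KeyValueStore


async def main() -> None:
    # Open the default key-value store and list what is in it. The exact key
    # names of the error snapshots are generated by ErrorSnapshotter, so this
    # just prints everything rather than relying on a specific naming scheme.
    kvs = await KeyValueStore.open()
    async for item in kvs.iterate_keys():
        print(item.key)


if __name__ == '__main__':
    asyncio.run(main())
```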
Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+import asyncio
+from random import choice
+
+from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
+from crawlee.statistics import Statistics
+
+
+async def main() -> None:
+    crawler = ParselCrawler(
+        statistics=Statistics.with_default_state(save_error_snapshots=True)
+    )
+
+    @crawler.router.default_handler
+    async def request_handler(context: ParselCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url} ...')
+        # Simulate various errors to demonstrate `ErrorSnapshotter`
+        # saving only the first occurrence of each unique error.
+        await context.enqueue_links()
+        random_number = choice(range(10))
+        if random_number == 1:
+            raise KeyError('Some KeyError')
+        if random_number == 2:
+            raise ValueError('Some ValueError')
+        if random_number == 3:
+            raise RuntimeError('Some RuntimeError')
+
+    await crawler.run(['https://crawlee.dev'])
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+import asyncio
+from random import choice
+
+from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
+from crawlee.statistics import Statistics
+
+
+async def main() -> None:
+    crawler = PlaywrightCrawler(
+        statistics=Statistics.with_default_state(save_error_snapshots=True)
+    )
+
+    @crawler.router.default_handler
+    async def request_handler(context: PlaywrightCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url} ...')
+        # Simulate various errors to demonstrate `ErrorSnapshotter`
+        # saving only the first occurrence of each unique error.
+        await context.enqueue_links()
+        random_number = choice(range(10))
+        if random_number == 1:
+            raise KeyError('Some KeyError')
+        if random_number == 2:
+            raise ValueError('Some ValueError')
+        if random_number == 3:
+            raise RuntimeError('Some RuntimeError')
+
+    await crawler.run(['https://crawlee.dev'])
+
+
+if __name__ == '__main__':
+    asyncio.run(main())

docs/examples/crawl_specific_links_on_website.mdx

Lines changed: 2 additions & 2 deletions
@@ -35,12 +35,12 @@ This example demonstrates how to crawl a website while targeting specific patter
 
 <Tabs groupId="second-example">
     <TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler">
-        <RunnableCodeBlock className="language-python">
+        <RunnableCodeBlock className="language-python" language="python">
             {BeautifulSoupExampleExtractAndAdd}
         </RunnableCodeBlock>
     </TabItem>
     <TabItem value="PlaywrightCrawler" label="PlaywrightCrawler">
-        <RunnableCodeBlock className="language-python">
+        <RunnableCodeBlock className="language-python" language="python">
             {PlaywrightExampleExtractAndAdd}
         </RunnableCodeBlock>
     </TabItem>

pyproject.toml

Lines changed: 5 additions & 0 deletions
@@ -179,6 +179,11 @@ indent-style = "space"
     "F841", # Local variable {variable} is assigned to but never used
     "N999", # Invalid module name
 ]
+"**/docs/examples/code_examples/*crawler_with_error_snapshotter.py" = [
+    "PLR2004", # Magic value used in comparison. Ignored for simplicity and readability of example code.
+]
+
+
 
 [tool.ruff.lint.flake8-quotes]
 docstring-quotes = "double"

src/crawlee/_types.py

Lines changed: 32 additions & 0 deletions
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import dataclasses
 from collections.abc import Iterator, Mapping
 from dataclasses import dataclass
 from enum import Enum
@@ -559,6 +560,33 @@ def __call__(
         """
 
 
+@docs_group('Data structures')
+@dataclasses.dataclass
+class PageSnapshot:
+    """Snapshot of a crawled page."""
+
+    screenshot: bytes | None = None
+    """Screenshot of the page."""
+
+    html: str | None = None
+    """HTML content of the page."""
+
+    def __bool__(self) -> bool:
+        return bool(self.screenshot or self.html)
+
+
+@docs_group('Functions')
+class GetPageSnapshot(Protocol):
+    """A function for getting snapshot of a page."""
+
+    def __call__(self) -> Coroutine[None, None, PageSnapshot]:
+        """Get page snapshot.
+
+        Returns:
+            Snapshot of a page.
+        """
+
+
 @docs_group('Functions')
 class UseStateFunction(Protocol):
     """A function for managing state within the crawling context.
@@ -619,6 +647,10 @@ class BasicCrawlingContext:
     log: logging.Logger
     """Logger instance."""
 
+    async def get_snapshot(self) -> PageSnapshot:
+        """Get snapshot of crawled page."""
+        return PageSnapshot()
+
     def __hash__(self) -> int:
         """Return hash of the context. Each context is considered unique."""
         return id(self)
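For reference, the new `PageSnapshot` dataclass is deliberately small: it is truthy only when at least one field was captured, which lets callers of `get_snapshot()` skip empty snapshots cheaply. A tiny sketch (note that `crawlee._types` is the private module shown in this diff, imported here only for illustration):

```python
from crawlee._types import PageSnapshot  # private module shown in this diff

# PageSnapshot is truthy only when at least one of its fields was captured.
assert bool(PageSnapshot(html='<html><body>boom</body></html>'))
assert not bool(PageSnapshot())
```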

src/crawlee/crawlers/_abstract_http/_http_crawling_context.py

Lines changed: 5 additions & 1 deletion
@@ -5,7 +5,7 @@
 
 from typing_extensions import Self, TypeVar
 
-from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction, ExtractLinksFunction
+from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction, ExtractLinksFunction, PageSnapshot
 from crawlee._utils.docs import docs_group
 from crawlee.http_clients import HttpCrawlingResult, HttpResponse
 
@@ -24,6 +24,10 @@ def from_basic_crawling_context(cls, context: BasicCrawlingContext, http_respons
         context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)}
         return cls(http_response=http_response, **context_kwargs)
 
+    async def get_snapshot(self) -> PageSnapshot:
+        """Get snapshot of crawled page."""
+        return PageSnapshot(html=self.http_response.read().decode('utf-8'))
+
 
 @dataclass(frozen=True)
 @docs_group('Data structures')

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 4 additions & 4 deletions
@@ -892,7 +892,7 @@ async def _handle_request_retries(
 
         if self._should_retry_request(context, error):
             request.retry_count += 1
-            self._statistics.error_tracker.add(error)
+            await self._statistics.error_tracker.add(error=error, context=context)
 
             if self._error_handler:
                 try:
@@ -946,7 +946,7 @@ async def _handle_request_error(self, context: TCrawlingContext | BasicCrawlingC
 
     async def _handle_failed_request(self, context: TCrawlingContext | BasicCrawlingContext, error: Exception) -> None:
         self._logger.exception('Request failed and reached maximum retries', exc_info=error)
-        self._statistics.error_tracker.add(error)
+        await self._statistics.error_tracker.add(error=error, context=context)
 
         if self._failed_request_handler:
             try:
@@ -1162,7 +1162,7 @@ async def __run_task_function(self) -> None:
                 context.request.session_rotation_count += 1
 
                 await request_manager.reclaim_request(request)
-                self._statistics.error_tracker_retry.add(session_error)
+                await self._statistics.error_tracker_retry.add(error=session_error, context=context)
             else:
                 self._logger.exception('Request failed and reached maximum retries', exc_info=session_error)
 
@@ -1176,7 +1176,7 @@ async def __run_task_function(self) -> None:
             )
 
             self._statistics.record_request_processing_failure(statistics_id)
-            self._statistics.error_tracker.add(session_error)
+            await self._statistics.error_tracker.add(error=session_error, context=context)
 
         except ContextPipelineInterruptedError as interrupted_error:
             self._logger.debug('The context pipeline was interrupted', exc_info=interrupted_error)

src/crawlee/crawlers/_basic/_context_pipeline.py

Lines changed: 13 additions & 7 deletions
@@ -1,7 +1,6 @@
 from __future__ import annotations
 
-from collections.abc import AsyncGenerator, Awaitable, Generator
-from typing import Any, Callable, Generic, cast
+from typing import TYPE_CHECKING, Any, Callable, Generic, cast
 
 from typing_extensions import TypeVar
 
@@ -15,6 +14,9 @@
     SessionError,
 )
 
+if TYPE_CHECKING:
+    from collections.abc import AsyncGenerator, Awaitable, Generator
+
 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
 TMiddlewareCrawlingContext = TypeVar('TMiddlewareCrawlingContext', bound=BasicCrawlingContext)
 
@@ -31,7 +33,7 @@ def __init__(
         *,
         _middleware: Callable[
             [TCrawlingContext],
-            AsyncGenerator[TMiddlewareCrawlingContext, None],
+            AsyncGenerator[TMiddlewareCrawlingContext, Exception | None],
         ]
         | None = None,
         _parent: ContextPipeline[BasicCrawlingContext] | None = None,
@@ -55,7 +57,8 @@ async def __call__(
         Exceptions from the consumer function are wrapped together with the final crawling context.
         """
         chain = list(self._middleware_chain())
-        cleanup_stack = list[AsyncGenerator]()
+        cleanup_stack: list[AsyncGenerator[Any, Exception | None]] = []
+        final_consumer_exception: Exception | None = None
 
         try:
             for member in reversed(chain):
@@ -77,14 +80,16 @@
 
         try:
             await final_context_consumer(cast('TCrawlingContext', crawling_context))
-        except SessionError:  # Session errors get special treatment
+        except SessionError as e:  # Session errors get special treatment
+            final_consumer_exception = e
            raise
         except Exception as e:
+            final_consumer_exception = e
             raise RequestHandlerError(e, crawling_context) from e
         finally:
             for middleware_instance in reversed(cleanup_stack):
                 try:
-                    result = await middleware_instance.__anext__()
+                    result = await middleware_instance.asend(final_consumer_exception)
                 except StopAsyncIteration:  # noqa: PERF203
                     pass
                 except ContextPipelineInterruptedError as e:
@@ -111,7 +116,8 @@
         """
         return ContextPipeline[TMiddlewareCrawlingContext](
             _middleware=cast(
-                'Callable[[BasicCrawlingContext], AsyncGenerator[TMiddlewareCrawlingContext, None]]', middleware
+                'Callable[[BasicCrawlingContext], AsyncGenerator[TMiddlewareCrawlingContext, Exception | None]]',
+                middleware,
             ),
             _parent=cast('ContextPipeline[BasicCrawlingContext]', self),
         )
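This pipeline change is what makes early error snapshots possible: instead of resuming cleanup generators with `__anext__()`, the pipeline now sends the consumer's exception back into them via `asend()`, so a middleware such as `_navigate` can observe it at its `yield` while the page object is still open. A standalone sketch of that generator handshake (not crawlee code, just the mechanics):

```python
from __future__ import annotations

import asyncio
from collections.abc import AsyncGenerator


async def middleware() -> AsyncGenerator[str, Exception | None]:
    page = 'open page'  # stand-in for a resource that is still alive at the yield
    error = yield page  # receives the consumer's exception (or None) during cleanup
    if error:
        print(f'capturing snapshot of {page!r} because of: {error!r}')


async def main() -> None:
    gen = middleware()
    _ = await gen.__anext__()  # run the middleware up to its yield
    consumer_error: Exception | None = None
    try:
        raise ValueError('request handler failed')  # simulate the final consumer
    except ValueError as e:
        consumer_error = e
    try:
        await gen.asend(consumer_error)  # cleanup: deliver the exception into the generator
    except StopAsyncIteration:
        pass


asyncio.run(main())
```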

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 7 additions & 3 deletions
@@ -202,7 +202,7 @@ async def _open_page(
     async def _navigate(
         self,
         context: PlaywrightPreNavCrawlingContext,
-    ) -> AsyncGenerator[PlaywrightCrawlingContext, None]:
+    ) -> AsyncGenerator[PlaywrightCrawlingContext, Exception | None]:
         """Execute an HTTP request utilizing the `BrowserPool` and the `Playwright` library.
 
         Args:
@@ -238,7 +238,7 @@ async def _navigate(
 
         extract_links = self._create_extract_links_function(context)
 
-        yield PlaywrightCrawlingContext(
+        error = yield PlaywrightCrawlingContext(
             request=context.request,
             session=context.session,
             add_requests=context.add_requests,
@@ -251,11 +251,15 @@
             page=context.page,
             infinite_scroll=lambda: infinite_scroll(context.page),
             response=response,
-            enqueue_links=self._create_enqueue_links_function(context, extract_links),
             extract_links=extract_links,
+            enqueue_links=self._create_enqueue_links_function(context, extract_links),
             block_requests=partial(block_requests, page=context.page),
         )
 
+        # Collect data in case of errors, before the page object is closed.
+        if error:
+            await self.statistics.error_tracker.add(error=error, context=context, early=True)
+
     def _create_extract_links_function(self, context: PlaywrightPreNavCrawlingContext) -> ExtractLinksFunction:
         """Create a callback function for extracting links from context.
 
0 commit comments
