diff --git a/docs/examples/capturing_page_snapshots_with_error_snapshotter.mdx b/docs/examples/capturing_page_snapshots_with_error_snapshotter.mdx
new file mode 100644
index 0000000000..87ff540298
--- /dev/null
+++ b/docs/examples/capturing_page_snapshots_with_error_snapshotter.mdx
@@ -0,0 +1,27 @@
+---
+id: capturing-page-snapshots-with-error-snapshotter
+title: Capturing page snapshots with ErrorSnapshotter
+description: How to capture page snapshots on errors.
+---
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
+import ApiLink from '@site/src/components/ApiLink';
+import ParselCrawlerWithErrorSnapshotter from '!!raw-loader!roa-loader!./code_examples/parsel_crawler_with_error_snapshotter.py';
+import PlaywrightCrawlerWithErrorSnapshotter from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_with_error_snapshotter.py';
+
+
+This example demonstrates how to capture page snapshots on the first occurrence of each unique error. Capturing happens automatically once you set `save_error_snapshots=True` in the crawler's `Statistics`. An error snapshot can contain an HTML file and a JPEG file created from the page where the unhandled exception was raised; the captured files are saved to the default key-value store. Both `PlaywrightCrawler` and [HTTP crawlers](../guides/http-crawlers) can capture the HTML file, but only `PlaywrightCrawler` can capture a page screenshot as well.
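+
+A minimal sketch of the required configuration (the runnable examples below show the complete crawlers):
+
+```python
+from crawlee.crawlers import ParselCrawler
+from crawlee.statistics import Statistics
+
+crawler = ParselCrawler(
+    statistics=Statistics.with_default_state(save_error_snapshots=True)
+)
+```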
+
+<Tabs groupId="main">
+    <TabItem value="ParselCrawler" label="ParselCrawler">
+        <RunnableCodeBlock className="language-python" language="python">
+            { ParselCrawlerWithErrorSnapshotter }
+        </RunnableCodeBlock>
+    </TabItem>
+    <TabItem value="PlaywrightCrawler" label="PlaywrightCrawler">
+        <RunnableCodeBlock className="language-python" language="python">
+            { PlaywrightCrawlerWithErrorSnapshotter }
+        </RunnableCodeBlock>
+    </TabItem>
+</Tabs>
diff --git a/docs/examples/code_examples/parsel_crawler_with_error_snapshotter.py b/docs/examples/code_examples/parsel_crawler_with_error_snapshotter.py
new file mode 100644
index 0000000000..d7c3674571
--- /dev/null
+++ b/docs/examples/code_examples/parsel_crawler_with_error_snapshotter.py
@@ -0,0 +1,31 @@
+import asyncio
+from random import choice
+
+from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
+from crawlee.statistics import Statistics
+
+
+async def main() -> None:
+ crawler = ParselCrawler(
+ statistics=Statistics.with_default_state(save_error_snapshots=True)
+ )
+
+ @crawler.router.default_handler
+ async def request_handler(context: ParselCrawlingContext) -> None:
+ context.log.info(f'Processing {context.request.url} ...')
+ # Simulate various errors to demonstrate `ErrorSnapshotter`
+ # saving only the first occurrence of each unique error.
+ await context.enqueue_links()
+ random_number = choice(range(10))
+ if random_number == 1:
+ raise KeyError('Some KeyError')
+ if random_number == 2:
+ raise ValueError('Some ValueError')
+ if random_number == 3:
+ raise RuntimeError('Some RuntimeError')
+
+ await crawler.run(['https://crawlee.dev'])
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/docs/examples/code_examples/playwright_crawler_with_error_snapshotter.py b/docs/examples/code_examples/playwright_crawler_with_error_snapshotter.py
new file mode 100644
index 0000000000..90ddc6c3d4
--- /dev/null
+++ b/docs/examples/code_examples/playwright_crawler_with_error_snapshotter.py
@@ -0,0 +1,31 @@
+import asyncio
+from random import choice
+
+from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
+from crawlee.statistics import Statistics
+
+
+async def main() -> None:
+ crawler = PlaywrightCrawler(
+ statistics=Statistics.with_default_state(save_error_snapshots=True)
+ )
+
+ @crawler.router.default_handler
+ async def request_handler(context: PlaywrightCrawlingContext) -> None:
+ context.log.info(f'Processing {context.request.url} ...')
+ # Simulate various errors to demonstrate `ErrorSnapshotter`
+ # saving only the first occurrence of each unique error.
+ await context.enqueue_links()
+ random_number = choice(range(10))
+ if random_number == 1:
+ raise KeyError('Some KeyError')
+ if random_number == 2:
+ raise ValueError('Some ValueError')
+ if random_number == 3:
+ raise RuntimeError('Some RuntimeError')
+
+ await crawler.run(['https://crawlee.dev'])
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/docs/examples/crawl_specific_links_on_website.mdx b/docs/examples/crawl_specific_links_on_website.mdx
index c65bd9d897..b350568421 100644
--- a/docs/examples/crawl_specific_links_on_website.mdx
+++ b/docs/examples/crawl_specific_links_on_website.mdx
@@ -35,12 +35,12 @@ This example demonstrates how to crawl a website while targeting specific patter
-
+
{BeautifulSoupExampleExtractAndAdd}
-
+
{PlaywrightExampleExtractAndAdd}
diff --git a/pyproject.toml b/pyproject.toml
index 0021e36e92..c22cae326a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -179,6 +179,11 @@ indent-style = "space"
"F841", # Local variable {variable} is assigned to but never used
"N999", # Invalid module name
]
+"**/docs/examples/code_examples/*crawler_with_error_snapshotter.py" = [
+ "PLR2004", # Magic value used in comparison. Ignored for simplicity and readability of example code.
+]
+
+
[tool.ruff.lint.flake8-quotes]
docstring-quotes = "double"
diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py
index 59f9705967..c68ae63df9 100644
--- a/src/crawlee/_types.py
+++ b/src/crawlee/_types.py
@@ -1,5 +1,6 @@
from __future__ import annotations
+import dataclasses
from collections.abc import Iterator, Mapping
from dataclasses import dataclass
from enum import Enum
@@ -559,6 +560,33 @@ def __call__(
"""
+@docs_group('Data structures')
+@dataclasses.dataclass
+class PageSnapshot:
+ """Snapshot of a crawled page."""
+
+ screenshot: bytes | None = None
+ """Screenshot of the page format."""
+
+ html: str | None = None
+ """HTML content of the page."""
+
+ def __bool__(self) -> bool:
+ return bool(self.screenshot or self.html)
+
+
+@docs_group('Functions')
+class GetPageSnapshot(Protocol):
+ """A function for getting snapshot of a page."""
+
+ def __call__(self) -> Coroutine[None, None, PageSnapshot]:
+ """Get page snapshot.
+
+ Returns:
+ Snapshot of a page.
+ """
+
+
@docs_group('Functions')
class UseStateFunction(Protocol):
"""A function for managing state within the crawling context.
@@ -619,6 +647,10 @@ class BasicCrawlingContext:
log: logging.Logger
"""Logger instance."""
+ async def get_snapshot(self) -> PageSnapshot:
+ """Get snapshot of crawled page."""
+ return PageSnapshot()
+
def __hash__(self) -> int:
"""Return hash of the context. Each context is considered unique."""
return id(self)
diff --git a/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py b/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py
index 9e6926c421..fb5d6802f9 100644
--- a/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py
+++ b/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py
@@ -5,7 +5,7 @@
from typing_extensions import Self, TypeVar
-from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction, ExtractLinksFunction
+from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction, ExtractLinksFunction, PageSnapshot
from crawlee._utils.docs import docs_group
from crawlee.http_clients import HttpCrawlingResult, HttpResponse
@@ -24,6 +24,10 @@ def from_basic_crawling_context(cls, context: BasicCrawlingContext, http_respons
context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)}
return cls(http_response=http_response, **context_kwargs)
+ async def get_snapshot(self) -> PageSnapshot:
+ """Get snapshot of crawled page."""
+ return PageSnapshot(html=self.http_response.read().decode('utf-8'))
+
@dataclass(frozen=True)
@docs_group('Data structures')
diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py
index 8c76da798e..a196f5e251 100644
--- a/src/crawlee/crawlers/_basic/_basic_crawler.py
+++ b/src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -892,7 +892,7 @@ async def _handle_request_retries(
if self._should_retry_request(context, error):
request.retry_count += 1
- self._statistics.error_tracker.add(error)
+ await self._statistics.error_tracker.add(error=error, context=context)
if self._error_handler:
try:
@@ -946,7 +946,7 @@ async def _handle_request_error(self, context: TCrawlingContext | BasicCrawlingC
async def _handle_failed_request(self, context: TCrawlingContext | BasicCrawlingContext, error: Exception) -> None:
self._logger.exception('Request failed and reached maximum retries', exc_info=error)
- self._statistics.error_tracker.add(error)
+ await self._statistics.error_tracker.add(error=error, context=context)
if self._failed_request_handler:
try:
@@ -1162,7 +1162,7 @@ async def __run_task_function(self) -> None:
context.request.session_rotation_count += 1
await request_manager.reclaim_request(request)
- self._statistics.error_tracker_retry.add(session_error)
+ await self._statistics.error_tracker_retry.add(error=session_error, context=context)
else:
self._logger.exception('Request failed and reached maximum retries', exc_info=session_error)
@@ -1176,7 +1176,7 @@ async def __run_task_function(self) -> None:
)
self._statistics.record_request_processing_failure(statistics_id)
- self._statistics.error_tracker.add(session_error)
+ await self._statistics.error_tracker.add(error=session_error, context=context)
except ContextPipelineInterruptedError as interrupted_error:
self._logger.debug('The context pipeline was interrupted', exc_info=interrupted_error)
diff --git a/src/crawlee/crawlers/_basic/_context_pipeline.py b/src/crawlee/crawlers/_basic/_context_pipeline.py
index d5a5b43e3b..b236092541 100644
--- a/src/crawlee/crawlers/_basic/_context_pipeline.py
+++ b/src/crawlee/crawlers/_basic/_context_pipeline.py
@@ -1,7 +1,6 @@
from __future__ import annotations
-from collections.abc import AsyncGenerator, Awaitable, Generator
-from typing import Any, Callable, Generic, cast
+from typing import TYPE_CHECKING, Any, Callable, Generic, cast
from typing_extensions import TypeVar
@@ -15,6 +14,9 @@
SessionError,
)
+if TYPE_CHECKING:
+ from collections.abc import AsyncGenerator, Awaitable, Generator
+
TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
TMiddlewareCrawlingContext = TypeVar('TMiddlewareCrawlingContext', bound=BasicCrawlingContext)
@@ -31,7 +33,7 @@ def __init__(
*,
_middleware: Callable[
[TCrawlingContext],
- AsyncGenerator[TMiddlewareCrawlingContext, None],
+ AsyncGenerator[TMiddlewareCrawlingContext, Exception | None],
]
| None = None,
_parent: ContextPipeline[BasicCrawlingContext] | None = None,
@@ -55,7 +57,8 @@ async def __call__(
Exceptions from the consumer function are wrapped together with the final crawling context.
"""
chain = list(self._middleware_chain())
- cleanup_stack = list[AsyncGenerator]()
+ cleanup_stack: list[AsyncGenerator[Any, Exception | None]] = []
+ final_consumer_exception: Exception | None = None
try:
for member in reversed(chain):
@@ -77,14 +80,16 @@ async def __call__(
try:
await final_context_consumer(cast('TCrawlingContext', crawling_context))
- except SessionError: # Session errors get special treatment
+ except SessionError as e: # Session errors get special treatment
+ final_consumer_exception = e
raise
except Exception as e:
+ final_consumer_exception = e
raise RequestHandlerError(e, crawling_context) from e
finally:
for middleware_instance in reversed(cleanup_stack):
try:
- result = await middleware_instance.__anext__()
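+ # Send the final consumer's exception (if any) into the middleware so it can react to it during cleanup.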
+ result = await middleware_instance.asend(final_consumer_exception)
except StopAsyncIteration: # noqa: PERF203
pass
except ContextPipelineInterruptedError as e:
@@ -111,7 +116,8 @@ def compose(
"""
return ContextPipeline[TMiddlewareCrawlingContext](
_middleware=cast(
- 'Callable[[BasicCrawlingContext], AsyncGenerator[TMiddlewareCrawlingContext, None]]', middleware
+ 'Callable[[BasicCrawlingContext], AsyncGenerator[TMiddlewareCrawlingContext, Exception | None]]',
+ middleware,
),
_parent=cast('ContextPipeline[BasicCrawlingContext]', self),
)
diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py
index 0845a59dd6..468ce01e02 100644
--- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py
+++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py
@@ -202,7 +202,7 @@ async def _open_page(
async def _navigate(
self,
context: PlaywrightPreNavCrawlingContext,
- ) -> AsyncGenerator[PlaywrightCrawlingContext, None]:
+ ) -> AsyncGenerator[PlaywrightCrawlingContext, Exception | None]:
"""Execute an HTTP request utilizing the `BrowserPool` and the `Playwright` library.
Args:
@@ -238,7 +238,7 @@ async def _navigate(
extract_links = self._create_extract_links_function(context)
- yield PlaywrightCrawlingContext(
+ error = yield PlaywrightCrawlingContext(
request=context.request,
session=context.session,
add_requests=context.add_requests,
@@ -251,11 +251,15 @@ async def _navigate(
page=context.page,
infinite_scroll=lambda: infinite_scroll(context.page),
response=response,
- enqueue_links=self._create_enqueue_links_function(context, extract_links),
extract_links=extract_links,
+ enqueue_links=self._create_enqueue_links_function(context, extract_links),
block_requests=partial(block_requests, page=context.page),
)
+ # Collect data in case of errors, before the page object is closed.
+ if error:
+ await self.statistics.error_tracker.add(error=error, context=context, early=True)
+
def _create_extract_links_function(self, context: PlaywrightPreNavCrawlingContext) -> ExtractLinksFunction:
"""Create a callback function for extracting links from context.
diff --git a/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py b/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py
index 9e2e864cf2..27b2d21505 100644
--- a/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py
+++ b/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py
@@ -3,7 +3,7 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING
-from crawlee._types import BasicCrawlingContext
+from crawlee._types import BasicCrawlingContext, PageSnapshot
from crawlee._utils.docs import docs_group
if TYPE_CHECKING:
@@ -25,3 +25,20 @@ class PlaywrightPreNavCrawlingContext(BasicCrawlingContext):
block_requests: BlockRequestsFunction
"""Blocks network requests matching specified URL patterns."""
+
+ async def get_snapshot(self) -> PageSnapshot:
+ """Get snapshot of crawled page."""
+ html = None
+ screenshot = None
+
+ try:
+ html = await self.page.content()
+ except Exception:
+ self.log.exception(f'Failed to get html snapshot for {self.request.url}.')
+
+ try:
+ screenshot = await self.page.screenshot(full_page=True, type='jpeg')
+ except Exception:
+ self.log.exception(f'Failed to get page screenshot for {self.request.url}.')
+
+ return PageSnapshot(html=html, screenshot=screenshot)
diff --git a/src/crawlee/statistics/_error_snapshotter.py b/src/crawlee/statistics/_error_snapshotter.py
new file mode 100644
index 0000000000..52fbb30b18
--- /dev/null
+++ b/src/crawlee/statistics/_error_snapshotter.py
@@ -0,0 +1,77 @@
+from __future__ import annotations
+
+import asyncio
+import hashlib
+import re
+import string
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+ from crawlee._types import BasicCrawlingContext
+ from crawlee.storages import KeyValueStore
+
+
+class ErrorSnapshotter:
+ MAX_ERROR_CHARACTERS = 30
+ MAX_HASH_LENGTH = 30
+ MAX_FILENAME_LENGTH = 250
+ BASE_MESSAGE = 'An error occurred'
+ SNAPSHOT_PREFIX = 'ERROR_SNAPSHOT'
+ ALLOWED_CHARACTERS = string.ascii_letters + string.digits + '!-_.'
+
+ def __init__(self, kvs: KeyValueStore | None = None) -> None:
+ self._kvs = kvs
+
+ @property
+ def kvs(self) -> KeyValueStore:
+ if not self._kvs:
+ raise RuntimeError('The key-value store has not been set yet.')
+ return self._kvs
+
+ @kvs.setter
+ def kvs(self, kvs: KeyValueStore) -> None:
+ self._kvs = kvs
+
+ async def capture_snapshot(self, error_message: str, file_and_line: str, context: BasicCrawlingContext) -> None:
+ """Capture error snapshot and save it to key value store.
+
+ It saves the error snapshot directly to a key value store. It can't use `context.get_key_value_store` because
+ it returns `KeyValueStoreChangeRecords` which is commited to the key value store only if the `RequestHandler`
+ returned without an exception. ErrorSnapshotter is on the contrary active only when `RequestHandler` fails with
+ an exception.
+
+ Args:
+ error_message: Used in filename of the snapshot.
+ file_and_line: Used in filename of the snapshot.
+ context: Context that is used to get the snapshot.
+ """
+ if snapshot := await context.get_snapshot():
+ snapshot_base_name = self._get_snapshot_base_name(error_message, file_and_line)
+ snapshot_save_tasks = []
+ if snapshot.html:
+ snapshot_save_tasks.append(
+ asyncio.create_task(self._save_html(snapshot.html, base_name=snapshot_base_name))
+ )
+ if snapshot.screenshot:
+ snapshot_save_tasks.append(
+ asyncio.create_task(self._save_screenshot(snapshot.screenshot, base_name=snapshot_base_name))
+ )
+ await asyncio.gather(*snapshot_save_tasks)
+
+ async def _save_html(self, html: str, base_name: str) -> None:
+ file_name = f'{base_name}.html'
+ await self.kvs.set_value(file_name, html, content_type='text/html')
+
+ async def _save_screenshot(self, screenshot: bytes, base_name: str) -> None:
+ file_name = f'{base_name}.jpg'
+ await self.kvs.set_value(file_name, screenshot, content_type='image/jpeg')
+
+ def _sanitize_filename(self, filename: str) -> str:
+ return re.sub(f'[^{re.escape(self.ALLOWED_CHARACTERS)}]', '', filename[: self.MAX_FILENAME_LENGTH])
+
+ def _get_snapshot_base_name(self, error_message: str, file_and_line: str) -> str:
+ sha1_hash = hashlib.sha1() # noqa:S324 # Collision-related attacks are of no concern here.
+ sha1_hash.update(file_and_line.encode('utf-8'))
+ hashed_file_and_text = sha1_hash.hexdigest()[: self.MAX_HASH_LENGTH]
+ error_message_start = (error_message or self.BASE_MESSAGE)[: self.MAX_ERROR_CHARACTERS]
+ return self._sanitize_filename(f'{self.SNAPSHOT_PREFIX}_{hashed_file_and_text}_{error_message_start}')
diff --git a/src/crawlee/statistics/_error_tracker.py b/src/crawlee/statistics/_error_tracker.py
index e607093b50..4e70aaa2b7 100644
--- a/src/crawlee/statistics/_error_tracker.py
+++ b/src/crawlee/statistics/_error_tracker.py
@@ -5,12 +5,21 @@
import traceback
from collections import Counter, defaultdict
from itertools import zip_longest
-from typing import Union
+from logging import getLogger
+from typing import TYPE_CHECKING, Union
+
+from crawlee.statistics._error_snapshotter import ErrorSnapshotter
+
+if TYPE_CHECKING:
+ from crawlee._types import BasicCrawlingContext
GroupName = Union[str, None]
ErrorFilenameGroups = dict[GroupName, dict[GroupName, Counter[GroupName]]]
+logger = getLogger(__name__)
+
+
class ErrorTracker:
"""Track errors and aggregates their counts by similarity."""
@@ -21,7 +30,9 @@ def __init__(
show_file_and_line_number: bool = True,
show_error_message: bool = True,
show_full_message: bool = False,
+ save_error_snapshots: bool = False,
) -> None:
+ self.error_snapshotter = ErrorSnapshotter() if save_error_snapshots else None
self.show_error_name = show_error_name
self.show_file_and_line_number = show_file_and_line_number
self.show_error_message = show_error_message
@@ -29,11 +40,34 @@ def __init__(
raise ValueError('`show_error_message` must be `True` if `show_full_message` is set to `True`')
self.show_full_message = show_full_message
self._errors: ErrorFilenameGroups = defaultdict(lambda: defaultdict(Counter))
+ self._early_reported_errors = set[int]()
+
+ async def add(
+ self,
+ error: Exception,
+ *,
+ context: BasicCrawlingContext | None = None,
+ early: bool = False,
+ ) -> None:
+ """Add an error in the statistics.
+
+ Args:
+ error: Error to be added to statistics.
+ context: Context used to collect error snapshot.
+ early: Flag indicating that the error is added earlier than usual to have access to resources that will be
+ closed before normal error collection. This prevents double reporting during normal error collection.
+ """
+ if id(error) in self._early_reported_errors:
+ # This error was already collected early, before the relevant resources were closed; skip double reporting.
+ self._early_reported_errors.remove(id(error))
+ return
+
+ if early:
+ self._early_reported_errors.add(id(error))
- def add(self, error: Exception) -> None:
- """Include an error in the statistics."""
error_group_name = error.__class__.__name__ if self.show_error_name else None
error_group_message = self._get_error_message(error)
+ new_error_group_message = '' # In case of wildcard similarity match
error_group_file_and_line = self._get_file_and_line(error)
# First two levels are grouped only in case of exact match.
@@ -58,18 +92,42 @@ def add(self, error: Exception) -> None:
# No similar message found. Create new group.
self._errors[error_group_file_and_line][error_group_name].update([error_group_message])
- def _get_file_and_line(self, error: Exception) -> str | None:
+ if (
+ self._errors[error_group_file_and_line][error_group_name][new_error_group_message or error_group_message]
+ == 1
+ and context is not None
+ ):
+ # Save a snapshot only on the first occurrence of the error and only if a context was passed as well.
+ await self._capture_error_snapshot(
+ error_message=new_error_group_message or error_group_message,
+ file_and_line=error_group_file_and_line,
+ context=context,
+ )
+
+ async def _capture_error_snapshot(
+ self, error_message: str, file_and_line: str, context: BasicCrawlingContext
+ ) -> None:
+ if self.error_snapshotter:
+ try:
+ await self.error_snapshotter.capture_snapshot(
+ error_message=error_message, file_and_line=file_and_line, context=context
+ )
+ except Exception:
+ logger.exception(f'Error when trying to collect error snapshot for exception: {error_message}')
+
+ def _get_file_and_line(self, error: Exception) -> str:
if self.show_file_and_line_number:
error_traceback = traceback.extract_tb(error.__traceback__)
- return f'{error_traceback[0].filename.split("/")[-1]}:{error_traceback[0].lineno}'
- return None
+ # Show only the most specific frame.
+ return f'{error_traceback[-1].filename.split("/")[-1]}:{error_traceback[-1].lineno}'
+ return ''
- def _get_error_message(self, error: Exception) -> str | None:
+ def _get_error_message(self, error: Exception) -> str:
if self.show_error_message:
if self.show_full_message:
return str(error.args[0])
return str(error.args[0]).split('\n')[0]
- return None
+ return ''
@property
def unique_error_count(self) -> int:
@@ -108,13 +166,13 @@ def _get_error_repr(self, file_and_line: str | None, name: str | None, message:
return f'{file_and_line_part}{name_part}{message_part}'
@staticmethod
- def _create_generic_message(message_1: str | None, message_2: str | None) -> str | None:
+ def _create_generic_message(message_1: str | None, message_2: str | None) -> str:
"""Create a generic error message from two messages, if they are similar enough.
- Different parts of similar messages are replaced by `_`.
+ Different parts of similar messages are replaced by `***`.
"""
if message_1 is None or message_2 is None:
- return None
+ return ''
replacement_string = '***'
replacement_count = 0
diff --git a/src/crawlee/statistics/_statistics.py b/src/crawlee/statistics/_statistics.py
index 3c078319b5..d07f379d7a 100644
--- a/src/crawlee/statistics/_statistics.py
+++ b/src/crawlee/statistics/_statistics.py
@@ -77,6 +77,7 @@ def __init__(
log_interval: timedelta = timedelta(minutes=1),
state_model: type[TStatisticsState],
statistics_log_format: Literal['table', 'inline'] = 'table',
+ save_error_snapshots: bool = False,
) -> None:
self._id = Statistics.__next_id
Statistics.__next_id += 1
@@ -86,8 +87,8 @@ def __init__(
self._instance_start: datetime | None = None
self._retry_histogram = dict[int, int]()
- self.error_tracker = ErrorTracker()
- self.error_tracker_retry = ErrorTracker()
+ self.error_tracker = ErrorTracker(save_error_snapshots=save_error_snapshots)
+ self.error_tracker_retry = ErrorTracker(save_error_snapshots=False)
self._requests_in_progress = dict[str, RequestProcessingRecord]()
@@ -132,6 +133,7 @@ def with_default_state(
periodic_message_logger: Logger | None = None,
log_interval: timedelta = timedelta(minutes=1),
statistics_log_format: Literal['table', 'inline'] = 'table',
+ save_error_snapshots: bool = False,
) -> Statistics[StatisticsState]:
"""Initialize a new instance with default state model `StatisticsState`."""
return Statistics[StatisticsState](
@@ -144,6 +146,7 @@ def with_default_state(
log_interval=log_interval,
state_model=StatisticsState,
statistics_log_format=statistics_log_format,
+ save_error_snapshots=save_error_snapshots,
)
@property
@@ -169,6 +172,9 @@ async def __aenter__(self) -> Self:
if self._key_value_store is None:
self._key_value_store = await KeyValueStore.open(name=self._persist_state_kvs_name)
+ if self.error_tracker.error_snapshotter:
+ self.error_tracker.error_snapshotter.kvs = self._key_value_store
+
await self._maybe_load_statistics()
event_manager = service_locator.get_event_manager()
event_manager.on(event=Event.PERSIST_STATE, listener=self._persist_state)
diff --git a/tests/unit/_statistics/test_error_tracker.py b/tests/unit/_statistics/test_error_tracker.py
index 1895671267..d7169752f0 100644
--- a/tests/unit/_statistics/test_error_tracker.py
+++ b/tests/unit/_statistics/test_error_tracker.py
@@ -17,7 +17,7 @@
(ErrorTracker(show_error_name=False, show_file_and_line_number=False, show_error_message=False), 1),
],
)
-def test_error_tracker_counts(error_tracker: ErrorTracker, expected_unique_errors: int) -> None:
+async def test_error_tracker_counts(error_tracker: ErrorTracker, expected_unique_errors: int) -> None:
"""Use different settings of `error_tracker` and test unique errors count."""
for error in [
@@ -31,12 +31,12 @@ def test_error_tracker_counts(error_tracker: ErrorTracker, expected_unique_error
try:
raise error # Errors raised on same line
except Exception as e: # noqa:PERF203
- error_tracker.add(e)
+ await error_tracker.add(e)
try:
raise ValueError('Some value error abc') # Same as one previous error, but different line.
except Exception as e:
- error_tracker.add(e)
+ await error_tracker.add(e)
assert error_tracker.total == 5
assert error_tracker.unique_error_count == expected_unique_errors
@@ -50,7 +50,7 @@ def test_error_tracker_counts(error_tracker: ErrorTracker, expected_unique_error
('Some error number 0 0 0', 'Some error number 1 0 1', 'Some error number *** 0 ***'),
],
)
-def test_error_tracker_similar_messages_full_stack(
+async def test_error_tracker_similar_messages_full_stack(
message_1: str, message_2: str, expected_generic_message: str
) -> None:
"""Test that similar messages collapse into same group with generic name that contains wildcard symbols."""
@@ -66,7 +66,7 @@ def test_error_tracker_similar_messages_full_stack(
try:
raise error # Errors raised on the same line
except Exception as e: # noqa:PERF203
- error_tracker.add(e)
+ await error_tracker.add(e)
line = traceback.extract_tb(e.__traceback__)[0].lineno
file_name = __file__.split('/')[-1]
@@ -86,7 +86,7 @@ def test_error_tracker_similar_messages_full_stack(
(False, 'Error line 1'),
],
)
-def test_show_full_message(*, show_full_message: bool, expected_message: str) -> None:
+async def test_show_full_message(*, show_full_message: bool, expected_message: str) -> None:
"""Test error message settings with both options of `show_full_message`."""
error_tracker = ErrorTracker(
show_error_name=False, show_file_and_line_number=False, show_full_message=show_full_message
@@ -95,6 +95,6 @@ def test_show_full_message(*, show_full_message: bool, expected_message: str) ->
try:
raise RuntimeError('Error line 1\n Error line 2') # Errors raised on the same line
except Exception as e:
- error_tracker.add(e)
+ await error_tracker.add(e)
assert error_tracker.get_most_common_errors()[0][0] == expected_message
diff --git a/tests/unit/crawlers/_http/test_http_crawler.py b/tests/unit/crawlers/_http/test_http_crawler.py
index bcb2356a1c..7f00ff2166 100644
--- a/tests/unit/crawlers/_http/test_http_crawler.py
+++ b/tests/unit/crawlers/_http/test_http_crawler.py
@@ -10,6 +10,8 @@
from crawlee import ConcurrencySettings, Request
from crawlee.crawlers import HttpCrawler
from crawlee.sessions import SessionPool
+from crawlee.statistics import Statistics
+from tests.unit.server_endpoints import HELLO_WORLD
if TYPE_CHECKING:
from collections.abc import Awaitable
@@ -522,3 +524,42 @@ async def handler(context: HttpCrawlingContext) -> None:
def test_default_logger() -> None:
assert HttpCrawler().log.name == 'HttpCrawler'
+
+
+async def test_get_snapshot(server_url: URL) -> None:
+ crawler = HttpCrawler()
+
+ snapshot = None
+
+ @crawler.router.default_handler
+ async def request_handler(context: HttpCrawlingContext) -> None:
+ nonlocal snapshot
+ snapshot = await context.get_snapshot()
+
+ await crawler.run([str(server_url)])
+
+ assert snapshot is not None
+ assert snapshot.html is not None
+ assert snapshot.html == HELLO_WORLD.decode('utf8')
+
+
+async def test_error_snapshot_through_statistics(server_url: URL) -> None:
+ crawler = HttpCrawler(statistics=Statistics.with_default_state(save_error_snapshots=True))
+
+ @crawler.router.default_handler
+ async def request_handler(context: HttpCrawlingContext) -> None:
+ raise RuntimeError(rf'Exception /\ with file name unfriendly symbols in {context.request.url}')
+
+ await crawler.run([str(server_url)])
+
+ kvs = await crawler.get_key_value_store()
+ kvs_content = {}
+ async for key_info in kvs.iterate_keys():
+ kvs_content[key_info.key] = await kvs.get_value(key_info.key)
+
+ # One error, retried three times.
+ assert crawler.statistics.error_tracker.total == 3
+ assert crawler.statistics.error_tracker.unique_error_count == 1
+ assert len(kvs_content) == 1
+ assert key_info.key.endswith('.html')
+ assert kvs_content[key_info.key] == HELLO_WORLD.decode('utf8')
diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py
index f31ccc8e47..9123c30904 100644
--- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py
+++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py
@@ -23,6 +23,9 @@
from crawlee.fingerprint_suite._consts import BROWSER_TYPE_HEADER_KEYWORD
from crawlee.proxy_configuration import ProxyConfiguration
from crawlee.sessions import SessionPool
+from crawlee.statistics import Statistics
+from crawlee.statistics._error_snapshotter import ErrorSnapshotter
+from tests.unit.server_endpoints import GENERIC_RESPONSE, HELLO_WORLD
if TYPE_CHECKING:
from pathlib import Path
@@ -492,3 +495,71 @@ async def some_hook(context: PlaywrightPreNavCrawlingContext) -> None:
assert fingerprints['window.navigator.userAgent']
assert 'headless' not in fingerprints['window.navigator.userAgent'].lower()
+
+
+async def test_get_snapshot(server_url: URL) -> None:
+ crawler = PlaywrightCrawler()
+
+ snapshot = None
+
+ @crawler.router.default_handler
+ async def request_handler(context: PlaywrightCrawlingContext) -> None:
+ nonlocal snapshot
+ snapshot = await context.get_snapshot()
+
+ await crawler.run([str(server_url)])
+
+ assert snapshot is not None
+ assert snapshot.html is not None
+ assert snapshot.screenshot is not None
+ # Check at least the expected JPEG start and end bytes. The content is not relevant for the test.
+ assert snapshot.screenshot.startswith(b'\xff\xd8')
+ assert snapshot.screenshot.endswith(b'\xff\xd9')
+ assert snapshot.html == HELLO_WORLD.decode('utf-8')
+
+
+async def test_error_snapshot_through_statistics(server_url: URL) -> None:
+ """Test correct use of error snapshotter by the Playwright crawler.
+
+ In this test the crawler visits 4 pages:
+ - the two page endpoints raise the same error
+ - the homepage endpoint raises a unique error
+ - the headers endpoint raises no error
+ """
+ max_retries = 2
+ crawler = PlaywrightCrawler(
+ statistics=Statistics.with_default_state(save_error_snapshots=True), max_request_retries=max_retries
+ )
+
+ @crawler.router.default_handler
+ async def request_handler(context: PlaywrightCrawlingContext) -> None:
+ if 'page' in context.request.url:
+ raise RuntimeError('page error')
+ if 'headers' in context.request.url:
+ return
+ raise RuntimeError('home error')
+
+ await crawler.run(
+ [str(server_url), str(server_url / 'page_1'), str(server_url / 'page_2'), str(server_url / 'headers')]
+ )
+
+ kvs = await crawler.get_key_value_store()
+ kvs_content = {}
+
+ async for key_info in kvs.iterate_keys():
+ kvs_content[key_info.key] = await kvs.get_value(key_info.key)
+
+ assert set(key_info.key).issubset(ErrorSnapshotter.ALLOWED_CHARACTERS)
+ if key_info.key.endswith('.jpg'):
+ # Check at least the expected JPEG start and end bytes. The content is not relevant for the test.
+ assert kvs_content[key_info.key].startswith(b'\xff\xd8')
+ assert kvs_content[key_info.key].endswith(b'\xff\xd9')
+ elif 'page' in key_info.key:
+ assert kvs_content[key_info.key] == GENERIC_RESPONSE.decode('utf-8')
+ else:
+ assert kvs_content[key_info.key] == HELLO_WORLD.decode('utf-8')
+
+ # Three errors, each counted twice due to retries, but only 2 unique -> 4 artifacts expected (2 x (html and jpg)).
+ assert crawler.statistics.error_tracker.total == 3 * max_retries
+ assert crawler.statistics.error_tracker.unique_error_count == 2
+ assert len(kvs_content) == 4
diff --git a/tests/unit/server.py b/tests/unit/server.py
index 8a17fd3d7b..29e789d013 100644
--- a/tests/unit/server.py
+++ b/tests/unit/server.py
@@ -11,6 +11,8 @@
from uvicorn.server import Server
from yarl import URL
+from tests.unit.server_endpoints import GENERIC_RESPONSE, HELLO_WORLD, INCAPSULA, SECONDARY_INDEX, START_ENQUEUE
+
if TYPE_CHECKING:
from socket import socket
@@ -158,11 +160,7 @@ async def hello_world(send: Send) -> None:
"""Handle basic requests with a simple HTML response."""
await send_html_response(
send,
- b"""
-
- Hello, world!
-
- """,
+ HELLO_WORLD,
)
@@ -256,15 +254,7 @@ async def start_enqueue_endpoint(send: Send) -> None:
"""Handle requests for the main page with links."""
await send_html_response(
send,
- b"""
-
- Hello
-
-
- Link 1
- Link 2
-
- """,
+ START_ENQUEUE,
)
@@ -272,15 +262,7 @@ async def secondary_index_endpoint(send: Send) -> None:
"""Handle requests for the secondary page with links."""
await send_html_response(
send,
- b"""
-
- Hello
-
-
- Link 3
- Link 4
-
- """,
+ SECONDARY_INDEX,
)
@@ -288,15 +270,7 @@ async def incapsula_endpoint(send: Send) -> None:
"""Handle requests for a page with an incapsula iframe."""
await send_html_response(
send,
- b"""
-
- Hello
-
-
-
-
- """,
+ INCAPSULA,
)
@@ -304,14 +278,7 @@ async def generic_response_endpoint(send: Send) -> None:
"""Handle requests with a generic HTML response."""
await send_html_response(
send,
- b"""
-
- Hello
-
-
- Insightful content
-
- """,
+ GENERIC_RESPONSE,
)
diff --git a/tests/unit/server_endpoints.py b/tests/unit/server_endpoints.py
new file mode 100644
index 0000000000..00456d3dcd
--- /dev/null
+++ b/tests/unit/server_endpoints.py
@@ -0,0 +1,43 @@
+# Response bodies served by the test server.
+
+HELLO_WORLD = b"""\
+
+ Hello, world!
+
+
+"""
+
+START_ENQUEUE = b"""\
+
+ Hello
+
+
+ Link 1
+ Link 2
+"""
+
+SECONDARY_INDEX = b"""\
+
+ Hello
+
+
+ Link 3
+ Link 4
+"""
+
+INCAPSULA = b"""\
+
+ Hello
+
+
+
+"""
+
+GENERIC_RESPONSE = b"""\
+
+ Hello
+
+
+ Insightful content
+"""