diff --git a/docs/examples/capturing_page_snapshots_with_error_snapshotter.mdx b/docs/examples/capturing_page_snapshots_with_error_snapshotter.mdx
new file mode 100644
index 0000000000..87ff540298
--- /dev/null
+++ b/docs/examples/capturing_page_snapshots_with_error_snapshotter.mdx
@@ -0,0 +1,27 @@
+---
+id: capturing-page-snapshots-with-error-snapshotter
+title: Capturing page snapshots with ErrorSnapshotter
+description: How to capture page snapshots on errors.
+---
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
+import ApiLink from '@site/src/components/ApiLink';
+import ParselCrawlerWithErrorSnapshotter from '!!raw-loader!roa-loader!./code_examples/parsel_crawler_with_error_snapshotter.py';
+import PlaywrightCrawlerWithErrorSnapshotter from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_with_error_snapshotter.py';
+
+
+This example demonstrates how to capture page snapshots on the first occurrence of each unique error. The capturing happens automatically once you set `save_error_snapshots=True` in the crawler's `Statistics`. An error snapshot can contain an `html` file and a `jpeg` file created from the page where the unhandled exception was raised. Captured error snapshot files are saved to the default key-value store. Both `PlaywrightCrawler` and [HTTP crawlers](../guides/http-crawlers) can capture the `html` file, but only `PlaywrightCrawler` can capture a page screenshot as well.
+
+
+
+
+    { ParselCrawlerWithErrorSnapshotter }
+
+
+
+
+    { PlaywrightCrawlerWithErrorSnapshotter }
+
+
+
diff --git a/docs/examples/code_examples/parsel_crawler_with_error_snapshotter.py b/docs/examples/code_examples/parsel_crawler_with_error_snapshotter.py
new file mode 100644
index 0000000000..d7c3674571
--- /dev/null
+++ b/docs/examples/code_examples/parsel_crawler_with_error_snapshotter.py
@@ -0,0 +1,31 @@
+import asyncio
+from random import choice
+
+from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
+from crawlee.statistics import Statistics
+
+
+async def main() -> None:
+    crawler = ParselCrawler(
+        statistics=Statistics.with_default_state(save_error_snapshots=True)
+    )
+
+    @crawler.router.default_handler
+    async def request_handler(context: ParselCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url} ...')
+        # Simulate various errors to demonstrate `ErrorSnapshotter`
+        # saving only the first occurrence of each unique error.
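+        # Snapshots are stored in the default key-value store under keys
+        # prefixed with `ERROR_SNAPSHOT`.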
+ await context.enqueue_links() + random_number = choice(range(10)) + if random_number == 1: + raise KeyError('Some KeyError') + if random_number == 2: + raise ValueError('Some ValueError') + if random_number == 3: + raise RuntimeError('Some RuntimeError') + + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/examples/code_examples/playwright_crawler_with_error_snapshotter.py b/docs/examples/code_examples/playwright_crawler_with_error_snapshotter.py new file mode 100644 index 0000000000..90ddc6c3d4 --- /dev/null +++ b/docs/examples/code_examples/playwright_crawler_with_error_snapshotter.py @@ -0,0 +1,31 @@ +import asyncio +from random import choice + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext +from crawlee.statistics import Statistics + + +async def main() -> None: + crawler = PlaywrightCrawler( + statistics=Statistics.with_default_state(save_error_snapshots=True) + ) + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + # Simulate various errors to demonstrate `ErrorSnapshotter` + # saving only the first occurrence of unique error. + await context.enqueue_links() + random_number = choice(range(10)) + if random_number == 1: + raise KeyError('Some KeyError') + if random_number == 2: + raise ValueError('Some ValueError') + if random_number == 3: + raise RuntimeError('Some RuntimeError') + + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/examples/crawl_specific_links_on_website.mdx b/docs/examples/crawl_specific_links_on_website.mdx index c65bd9d897..b350568421 100644 --- a/docs/examples/crawl_specific_links_on_website.mdx +++ b/docs/examples/crawl_specific_links_on_website.mdx @@ -35,12 +35,12 @@ This example demonstrates how to crawl a website while targeting specific patter - + {BeautifulSoupExampleExtractAndAdd} - + {PlaywrightExampleExtractAndAdd} diff --git a/pyproject.toml b/pyproject.toml index 0021e36e92..c22cae326a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -179,6 +179,11 @@ indent-style = "space" "F841", # Local variable {variable} is assigned to but never used "N999", # Invalid module name ] +"**/docs/examples/code_examples/*crawler_with_error_snapshotter.py" = [ + "PLR2004", # Magic value used in comparison. Ignored for simplicity and readability of example code. +] + + [tool.ruff.lint.flake8-quotes] docstring-quotes = "double" diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py index 59f9705967..c68ae63df9 100644 --- a/src/crawlee/_types.py +++ b/src/crawlee/_types.py @@ -1,5 +1,6 @@ from __future__ import annotations +import dataclasses from collections.abc import Iterator, Mapping from dataclasses import dataclass from enum import Enum @@ -559,6 +560,33 @@ def __call__( """ +@docs_group('Data structures') +@dataclasses.dataclass +class PageSnapshot: + """Snapshot of a crawled page.""" + + screenshot: bytes | None = None + """Screenshot of the page format.""" + + html: str | None = None + """HTML content of the page.""" + + def __bool__(self) -> bool: + return bool(self.screenshot or self.html) + + +@docs_group('Functions') +class GetPageSnapshot(Protocol): + """A function for getting snapshot of a page.""" + + def __call__(self) -> Coroutine[None, None, PageSnapshot]: + """Get page snapshot. + + Returns: + Snapshot of a page. 
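+            An empty (falsy) `PageSnapshot` means that no snapshot data could be captured.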
+ """ + + @docs_group('Functions') class UseStateFunction(Protocol): """A function for managing state within the crawling context. @@ -619,6 +647,10 @@ class BasicCrawlingContext: log: logging.Logger """Logger instance.""" + async def get_snapshot(self) -> PageSnapshot: + """Get snapshot of crawled page.""" + return PageSnapshot() + def __hash__(self) -> int: """Return hash of the context. Each context is considered unique.""" return id(self) diff --git a/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py b/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py index 9e6926c421..fb5d6802f9 100644 --- a/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py +++ b/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py @@ -5,7 +5,7 @@ from typing_extensions import Self, TypeVar -from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction, ExtractLinksFunction +from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction, ExtractLinksFunction, PageSnapshot from crawlee._utils.docs import docs_group from crawlee.http_clients import HttpCrawlingResult, HttpResponse @@ -24,6 +24,10 @@ def from_basic_crawling_context(cls, context: BasicCrawlingContext, http_respons context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)} return cls(http_response=http_response, **context_kwargs) + async def get_snapshot(self) -> PageSnapshot: + """Get snapshot of crawled page.""" + return PageSnapshot(html=self.http_response.read().decode('utf-8')) + @dataclass(frozen=True) @docs_group('Data structures') diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 8c76da798e..a196f5e251 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -892,7 +892,7 @@ async def _handle_request_retries( if self._should_retry_request(context, error): request.retry_count += 1 - self._statistics.error_tracker.add(error) + await self._statistics.error_tracker.add(error=error, context=context) if self._error_handler: try: @@ -946,7 +946,7 @@ async def _handle_request_error(self, context: TCrawlingContext | BasicCrawlingC async def _handle_failed_request(self, context: TCrawlingContext | BasicCrawlingContext, error: Exception) -> None: self._logger.exception('Request failed and reached maximum retries', exc_info=error) - self._statistics.error_tracker.add(error) + await self._statistics.error_tracker.add(error=error, context=context) if self._failed_request_handler: try: @@ -1162,7 +1162,7 @@ async def __run_task_function(self) -> None: context.request.session_rotation_count += 1 await request_manager.reclaim_request(request) - self._statistics.error_tracker_retry.add(session_error) + await self._statistics.error_tracker_retry.add(error=session_error, context=context) else: self._logger.exception('Request failed and reached maximum retries', exc_info=session_error) @@ -1176,7 +1176,7 @@ async def __run_task_function(self) -> None: ) self._statistics.record_request_processing_failure(statistics_id) - self._statistics.error_tracker.add(session_error) + await self._statistics.error_tracker.add(error=session_error, context=context) except ContextPipelineInterruptedError as interrupted_error: self._logger.debug('The context pipeline was interrupted', exc_info=interrupted_error) diff --git a/src/crawlee/crawlers/_basic/_context_pipeline.py b/src/crawlee/crawlers/_basic/_context_pipeline.py index d5a5b43e3b..b236092541 100644 --- 
a/src/crawlee/crawlers/_basic/_context_pipeline.py +++ b/src/crawlee/crawlers/_basic/_context_pipeline.py @@ -1,7 +1,6 @@ from __future__ import annotations -from collections.abc import AsyncGenerator, Awaitable, Generator -from typing import Any, Callable, Generic, cast +from typing import TYPE_CHECKING, Any, Callable, Generic, cast from typing_extensions import TypeVar @@ -15,6 +14,9 @@ SessionError, ) +if TYPE_CHECKING: + from collections.abc import AsyncGenerator, Awaitable, Generator + TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext) TMiddlewareCrawlingContext = TypeVar('TMiddlewareCrawlingContext', bound=BasicCrawlingContext) @@ -31,7 +33,7 @@ def __init__( *, _middleware: Callable[ [TCrawlingContext], - AsyncGenerator[TMiddlewareCrawlingContext, None], + AsyncGenerator[TMiddlewareCrawlingContext, Exception | None], ] | None = None, _parent: ContextPipeline[BasicCrawlingContext] | None = None, @@ -55,7 +57,8 @@ async def __call__( Exceptions from the consumer function are wrapped together with the final crawling context. """ chain = list(self._middleware_chain()) - cleanup_stack = list[AsyncGenerator]() + cleanup_stack: list[AsyncGenerator[Any, Exception | None]] = [] + final_consumer_exception: Exception | None = None try: for member in reversed(chain): @@ -77,14 +80,16 @@ async def __call__( try: await final_context_consumer(cast('TCrawlingContext', crawling_context)) - except SessionError: # Session errors get special treatment + except SessionError as e: # Session errors get special treatment + final_consumer_exception = e raise except Exception as e: + final_consumer_exception = e raise RequestHandlerError(e, crawling_context) from e finally: for middleware_instance in reversed(cleanup_stack): try: - result = await middleware_instance.__anext__() + result = await middleware_instance.asend(final_consumer_exception) except StopAsyncIteration: # noqa: PERF203 pass except ContextPipelineInterruptedError as e: @@ -111,7 +116,8 @@ def compose( """ return ContextPipeline[TMiddlewareCrawlingContext]( _middleware=cast( - 'Callable[[BasicCrawlingContext], AsyncGenerator[TMiddlewareCrawlingContext, None]]', middleware + 'Callable[[BasicCrawlingContext], AsyncGenerator[TMiddlewareCrawlingContext, Exception | None]]', + middleware, ), _parent=cast('ContextPipeline[BasicCrawlingContext]', self), ) diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index 0845a59dd6..468ce01e02 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -202,7 +202,7 @@ async def _open_page( async def _navigate( self, context: PlaywrightPreNavCrawlingContext, - ) -> AsyncGenerator[PlaywrightCrawlingContext, None]: + ) -> AsyncGenerator[PlaywrightCrawlingContext, Exception | None]: """Execute an HTTP request utilizing the `BrowserPool` and the `Playwright` library. 
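+
+        The crawling context is yielded to the rest of the pipeline, and the exception raised by the
+        request handler (if any) is sent back into this generator via `asend`, so an error snapshot can
+        be captured while the Playwright page is still open.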
Args: @@ -238,7 +238,7 @@ async def _navigate( extract_links = self._create_extract_links_function(context) - yield PlaywrightCrawlingContext( + error = yield PlaywrightCrawlingContext( request=context.request, session=context.session, add_requests=context.add_requests, @@ -251,11 +251,15 @@ async def _navigate( page=context.page, infinite_scroll=lambda: infinite_scroll(context.page), response=response, - enqueue_links=self._create_enqueue_links_function(context, extract_links), extract_links=extract_links, + enqueue_links=self._create_enqueue_links_function(context, extract_links), block_requests=partial(block_requests, page=context.page), ) + # Collect data in case of errors, before the page object is closed. + if error: + await self.statistics.error_tracker.add(error=error, context=context, early=True) + def _create_extract_links_function(self, context: PlaywrightPreNavCrawlingContext) -> ExtractLinksFunction: """Create a callback function for extracting links from context. diff --git a/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py b/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py index 9e2e864cf2..27b2d21505 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +++ b/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py @@ -3,7 +3,7 @@ from dataclasses import dataclass from typing import TYPE_CHECKING -from crawlee._types import BasicCrawlingContext +from crawlee._types import BasicCrawlingContext, PageSnapshot from crawlee._utils.docs import docs_group if TYPE_CHECKING: @@ -25,3 +25,20 @@ class PlaywrightPreNavCrawlingContext(BasicCrawlingContext): block_requests: BlockRequestsFunction """Blocks network requests matching specified URL patterns.""" + + async def get_snapshot(self) -> PageSnapshot: + """Get snapshot of crawled page.""" + html = None + screenshot = None + + try: + html = await self.page.content() + except Exception: + self.log.exception(f'Failed to get html snapshot for {self.request.url}.') + + try: + screenshot = await self.page.screenshot(full_page=True, type='jpeg') + except Exception: + self.log.exception(f'Failed to get page screenshot for {self.request.url}.') + + return PageSnapshot(html=html, screenshot=screenshot) diff --git a/src/crawlee/statistics/_error_snapshotter.py b/src/crawlee/statistics/_error_snapshotter.py new file mode 100644 index 0000000000..52fbb30b18 --- /dev/null +++ b/src/crawlee/statistics/_error_snapshotter.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +import asyncio +import hashlib +import re +import string +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from crawlee._types import BasicCrawlingContext + from crawlee.storages import KeyValueStore + + +class ErrorSnapshotter: + MAX_ERROR_CHARACTERS = 30 + MAX_HASH_LENGTH = 30 + MAX_FILENAME_LENGTH = 250 + BASE_MESSAGE = 'An error occurred' + SNAPSHOT_PREFIX = 'ERROR_SNAPSHOT' + ALLOWED_CHARACTERS = string.ascii_letters + string.digits + '!-_.' + + def __init__(self, kvs: KeyValueStore | None = None) -> None: + self._kvs = kvs + + @property + def kvs(self) -> KeyValueStore: + if not self._kvs: + raise RuntimeError('The key value store was not yet set.') + return self._kvs + + @kvs.setter + def kvs(self, kvs: KeyValueStore) -> None: + self._kvs = kvs + + async def capture_snapshot(self, error_message: str, file_and_line: str, context: BasicCrawlingContext) -> None: + """Capture error snapshot and save it to key value store. 
+ + It saves the error snapshot directly to a key value store. It can't use `context.get_key_value_store` because + it returns `KeyValueStoreChangeRecords` which is commited to the key value store only if the `RequestHandler` + returned without an exception. ErrorSnapshotter is on the contrary active only when `RequestHandler` fails with + an exception. + + Args: + error_message: Used in filename of the snapshot. + file_and_line: Used in filename of the snapshot. + context: Context that is used to get the snapshot. + """ + if snapshot := await context.get_snapshot(): + snapshot_base_name = self._get_snapshot_base_name(error_message, file_and_line) + snapshot_save_tasks = [] + if snapshot.html: + snapshot_save_tasks.append( + asyncio.create_task(self._save_html(snapshot.html, base_name=snapshot_base_name)) + ) + if snapshot.screenshot: + snapshot_save_tasks.append( + asyncio.create_task(self._save_screenshot(snapshot.screenshot, base_name=snapshot_base_name)) + ) + await asyncio.gather(*snapshot_save_tasks) + + async def _save_html(self, html: str, base_name: str) -> None: + file_name = f'{base_name}.html' + await self.kvs.set_value(file_name, html, content_type='text/html') + + async def _save_screenshot(self, screenshot: bytes, base_name: str) -> None: + file_name = f'{base_name}.jpg' + await self.kvs.set_value(file_name, screenshot, content_type='image/jpeg') + + def _sanitize_filename(self, filename: str) -> str: + return re.sub(f'[^{re.escape(self.ALLOWED_CHARACTERS)}]', '', filename[: self.MAX_FILENAME_LENGTH]) + + def _get_snapshot_base_name(self, error_message: str, file_and_line: str) -> str: + sha1_hash = hashlib.sha1() # noqa:S324 # Collisions related attacks are of no concern here. + sha1_hash.update(file_and_line.encode('utf-8')) + hashed_file_and_text = sha1_hash.hexdigest()[: self.MAX_HASH_LENGTH] + error_message_start = (error_message or self.BASE_MESSAGE)[: self.MAX_ERROR_CHARACTERS] + return self._sanitize_filename(f'{self.SNAPSHOT_PREFIX}_{hashed_file_and_text}_{error_message_start}') diff --git a/src/crawlee/statistics/_error_tracker.py b/src/crawlee/statistics/_error_tracker.py index e607093b50..4e70aaa2b7 100644 --- a/src/crawlee/statistics/_error_tracker.py +++ b/src/crawlee/statistics/_error_tracker.py @@ -5,12 +5,21 @@ import traceback from collections import Counter, defaultdict from itertools import zip_longest -from typing import Union +from logging import getLogger +from typing import TYPE_CHECKING, Union + +from crawlee.statistics._error_snapshotter import ErrorSnapshotter + +if TYPE_CHECKING: + from crawlee._types import BasicCrawlingContext GroupName = Union[str, None] ErrorFilenameGroups = dict[GroupName, dict[GroupName, Counter[GroupName]]] +logger = getLogger(__name__) + + class ErrorTracker: """Track errors and aggregates their counts by similarity.""" @@ -21,7 +30,9 @@ def __init__( show_file_and_line_number: bool = True, show_error_message: bool = True, show_full_message: bool = False, + save_error_snapshots: bool = False, ) -> None: + self.error_snapshotter = ErrorSnapshotter() if save_error_snapshots else None self.show_error_name = show_error_name self.show_file_and_line_number = show_file_and_line_number self.show_error_message = show_error_message @@ -29,11 +40,34 @@ def __init__( raise ValueError('`show_error_message` must be `True` if `show_full_message` is set to `True`') self.show_full_message = show_full_message self._errors: ErrorFilenameGroups = defaultdict(lambda: defaultdict(Counter)) + self._early_reported_errors = set[int]() + + async 
def add( + self, + error: Exception, + *, + context: BasicCrawlingContext | None = None, + early: bool = False, + ) -> None: + """Add an error in the statistics. + + Args: + error: Error to be added to statistics. + context: Context used to collect error snapshot. + early: Flag indicating that the error is added earlier than usual to have access to resources that will be + closed before normal error collection. This prevents double reporting during normal error collection. + """ + if id(error) in self._early_reported_errors: + # Error had to be collected earlier before relevant resources are closed. + self._early_reported_errors.remove(id(error)) + return + + if early: + self._early_reported_errors.add(id(error)) - def add(self, error: Exception) -> None: - """Include an error in the statistics.""" error_group_name = error.__class__.__name__ if self.show_error_name else None error_group_message = self._get_error_message(error) + new_error_group_message = '' # In case of wildcard similarity match error_group_file_and_line = self._get_file_and_line(error) # First two levels are grouped only in case of exact match. @@ -58,18 +92,42 @@ def add(self, error: Exception) -> None: # No similar message found. Create new group. self._errors[error_group_file_and_line][error_group_name].update([error_group_message]) - def _get_file_and_line(self, error: Exception) -> str | None: + if ( + self._errors[error_group_file_and_line][error_group_name][new_error_group_message or error_group_message] + == 1 + and context is not None + ): + # Save snapshot only on the first occurrence of the error and only if context and kvs was passed as well. + await self._capture_error_snapshot( + error_message=new_error_group_message or error_group_message, + file_and_line=error_group_file_and_line, + context=context, + ) + + async def _capture_error_snapshot( + self, error_message: str, file_and_line: str, context: BasicCrawlingContext + ) -> None: + if self.error_snapshotter: + try: + await self.error_snapshotter.capture_snapshot( + error_message=error_message, file_and_line=file_and_line, context=context + ) + except Exception: + logger.exception(f'Error when trying to collect error snapshot for exception: {error_message}') + + def _get_file_and_line(self, error: Exception) -> str: if self.show_file_and_line_number: error_traceback = traceback.extract_tb(error.__traceback__) - return f'{error_traceback[0].filename.split("/")[-1]}:{error_traceback[0].lineno}' - return None + # Show only the most specific frame. + return f'{error_traceback[-1].filename.split("/")[-1]}:{error_traceback[-1].lineno}' + return '' - def _get_error_message(self, error: Exception) -> str | None: + def _get_error_message(self, error: Exception) -> str: if self.show_error_message: if self.show_full_message: return str(error.args[0]) return str(error.args[0]).split('\n')[0] - return None + return '' @property def unique_error_count(self) -> int: @@ -108,13 +166,13 @@ def _get_error_repr(self, file_and_line: str | None, name: str | None, message: return f'{file_and_line_part}{name_part}{message_part}' @staticmethod - def _create_generic_message(message_1: str | None, message_2: str | None) -> str | None: + def _create_generic_message(message_1: str | None, message_2: str | None) -> str: """Create a generic error message from two messages, if they are similar enough. - Different parts of similar messages are replaced by `_`. + Different parts of similar messages are replaced by `***`. 
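+
+        For example, 'Some error number 0 0 0' and 'Some error number 1 0 1' are merged into
+        'Some error number *** 0 ***'.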
""" if message_1 is None or message_2 is None: - return None + return '' replacement_string = '***' replacement_count = 0 diff --git a/src/crawlee/statistics/_statistics.py b/src/crawlee/statistics/_statistics.py index 3c078319b5..d07f379d7a 100644 --- a/src/crawlee/statistics/_statistics.py +++ b/src/crawlee/statistics/_statistics.py @@ -77,6 +77,7 @@ def __init__( log_interval: timedelta = timedelta(minutes=1), state_model: type[TStatisticsState], statistics_log_format: Literal['table', 'inline'] = 'table', + save_error_snapshots: bool = False, ) -> None: self._id = Statistics.__next_id Statistics.__next_id += 1 @@ -86,8 +87,8 @@ def __init__( self._instance_start: datetime | None = None self._retry_histogram = dict[int, int]() - self.error_tracker = ErrorTracker() - self.error_tracker_retry = ErrorTracker() + self.error_tracker = ErrorTracker(save_error_snapshots=save_error_snapshots) + self.error_tracker_retry = ErrorTracker(save_error_snapshots=False) self._requests_in_progress = dict[str, RequestProcessingRecord]() @@ -132,6 +133,7 @@ def with_default_state( periodic_message_logger: Logger | None = None, log_interval: timedelta = timedelta(minutes=1), statistics_log_format: Literal['table', 'inline'] = 'table', + save_error_snapshots: bool = False, ) -> Statistics[StatisticsState]: """Initialize a new instance with default state model `StatisticsState`.""" return Statistics[StatisticsState]( @@ -144,6 +146,7 @@ def with_default_state( log_interval=log_interval, state_model=StatisticsState, statistics_log_format=statistics_log_format, + save_error_snapshots=save_error_snapshots, ) @property @@ -169,6 +172,9 @@ async def __aenter__(self) -> Self: if self._key_value_store is None: self._key_value_store = await KeyValueStore.open(name=self._persist_state_kvs_name) + if self.error_tracker.error_snapshotter: + self.error_tracker.error_snapshotter.kvs = self._key_value_store + await self._maybe_load_statistics() event_manager = service_locator.get_event_manager() event_manager.on(event=Event.PERSIST_STATE, listener=self._persist_state) diff --git a/tests/unit/_statistics/test_error_tracker.py b/tests/unit/_statistics/test_error_tracker.py index 1895671267..d7169752f0 100644 --- a/tests/unit/_statistics/test_error_tracker.py +++ b/tests/unit/_statistics/test_error_tracker.py @@ -17,7 +17,7 @@ (ErrorTracker(show_error_name=False, show_file_and_line_number=False, show_error_message=False), 1), ], ) -def test_error_tracker_counts(error_tracker: ErrorTracker, expected_unique_errors: int) -> None: +async def test_error_tracker_counts(error_tracker: ErrorTracker, expected_unique_errors: int) -> None: """Use different settings of `error_tracker` and test unique errors count.""" for error in [ @@ -31,12 +31,12 @@ def test_error_tracker_counts(error_tracker: ErrorTracker, expected_unique_error try: raise error # Errors raised on same line except Exception as e: # noqa:PERF203 - error_tracker.add(e) + await error_tracker.add(e) try: raise ValueError('Some value error abc') # Same as one previous error, but different line. 
except Exception as e: - error_tracker.add(e) + await error_tracker.add(e) assert error_tracker.total == 5 assert error_tracker.unique_error_count == expected_unique_errors @@ -50,7 +50,7 @@ def test_error_tracker_counts(error_tracker: ErrorTracker, expected_unique_error ('Some error number 0 0 0', 'Some error number 1 0 1', 'Some error number *** 0 ***'), ], ) -def test_error_tracker_similar_messages_full_stack( +async def test_error_tracker_similar_messages_full_stack( message_1: str, message_2: str, expected_generic_message: str ) -> None: """Test that similar messages collapse into same group with generic name that contains wildcard symbols.""" @@ -66,7 +66,7 @@ def test_error_tracker_similar_messages_full_stack( try: raise error # Errors raised on the same line except Exception as e: # noqa:PERF203 - error_tracker.add(e) + await error_tracker.add(e) line = traceback.extract_tb(e.__traceback__)[0].lineno file_name = __file__.split('/')[-1] @@ -86,7 +86,7 @@ def test_error_tracker_similar_messages_full_stack( (False, 'Error line 1'), ], ) -def test_show_full_message(*, show_full_message: bool, expected_message: str) -> None: +async def test_show_full_message(*, show_full_message: bool, expected_message: str) -> None: """Test error message settings with both options of `show_full_message`.""" error_tracker = ErrorTracker( show_error_name=False, show_file_and_line_number=False, show_full_message=show_full_message @@ -95,6 +95,6 @@ def test_show_full_message(*, show_full_message: bool, expected_message: str) -> try: raise RuntimeError('Error line 1\n Error line 2') # Errors raised on the same line except Exception as e: - error_tracker.add(e) + await error_tracker.add(e) assert error_tracker.get_most_common_errors()[0][0] == expected_message diff --git a/tests/unit/crawlers/_http/test_http_crawler.py b/tests/unit/crawlers/_http/test_http_crawler.py index bcb2356a1c..7f00ff2166 100644 --- a/tests/unit/crawlers/_http/test_http_crawler.py +++ b/tests/unit/crawlers/_http/test_http_crawler.py @@ -10,6 +10,8 @@ from crawlee import ConcurrencySettings, Request from crawlee.crawlers import HttpCrawler from crawlee.sessions import SessionPool +from crawlee.statistics import Statistics +from tests.unit.server_endpoints import HELLO_WORLD if TYPE_CHECKING: from collections.abc import Awaitable @@ -522,3 +524,42 @@ async def handler(context: HttpCrawlingContext) -> None: def test_default_logger() -> None: assert HttpCrawler().log.name == 'HttpCrawler' + + +async def test_get_snapshot(server_url: URL) -> None: + crawler = HttpCrawler() + + snapshot = None + + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + nonlocal snapshot + snapshot = await context.get_snapshot() + + await crawler.run([str(server_url)]) + + assert snapshot is not None + assert snapshot.html is not None + assert snapshot.html == HELLO_WORLD.decode('utf8') + + +async def test_error_snapshot_through_statistics(server_url: URL) -> None: + crawler = HttpCrawler(statistics=Statistics.with_default_state(save_error_snapshots=True)) + + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + raise RuntimeError(rf'Exception /\ with file name unfriendly symbols in {context.request.url}') + + await crawler.run([str(server_url)]) + + kvs = await crawler.get_key_value_store() + kvs_content = {} + async for key_info in kvs.iterate_keys(): + kvs_content[key_info.key] = await kvs.get_value(key_info.key) + + # One error, three time retried. 
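+    # Retries of the same error do not produce additional snapshots, so only one file is expected.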
+ assert crawler.statistics.error_tracker.total == 3 + assert crawler.statistics.error_tracker.unique_error_count == 1 + assert len(kvs_content) == 1 + assert key_info.key.endswith('.html') + assert kvs_content[key_info.key] == HELLO_WORLD.decode('utf8') diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py index f31ccc8e47..9123c30904 100644 --- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py +++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py @@ -23,6 +23,9 @@ from crawlee.fingerprint_suite._consts import BROWSER_TYPE_HEADER_KEYWORD from crawlee.proxy_configuration import ProxyConfiguration from crawlee.sessions import SessionPool +from crawlee.statistics import Statistics +from crawlee.statistics._error_snapshotter import ErrorSnapshotter +from tests.unit.server_endpoints import GENERIC_RESPONSE, HELLO_WORLD if TYPE_CHECKING: from pathlib import Path @@ -492,3 +495,71 @@ async def some_hook(context: PlaywrightPreNavCrawlingContext) -> None: assert fingerprints['window.navigator.userAgent'] assert 'headless' not in fingerprints['window.navigator.userAgent'].lower() + + +async def test_get_snapshot(server_url: URL) -> None: + crawler = PlaywrightCrawler() + + snapshot = None + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + nonlocal snapshot + snapshot = await context.get_snapshot() + + await crawler.run([str(server_url)]) + + assert snapshot is not None + assert snapshot.html is not None + assert snapshot.screenshot is not None + # Check at least jpeg start and end expected bytes. Content is not relevant for the test. + assert snapshot.screenshot.startswith(b'\xff\xd8') + assert snapshot.screenshot.endswith(b'\xff\xd9') + assert snapshot.html == HELLO_WORLD.decode('utf-8') + + +async def test_error_snapshot_through_statistics(server_url: URL) -> None: + """Test correct use of error snapshotter by the Playwright crawler. + + In this test the crawler will visit 4 pages. + - 2 x page endpoints will return the same error + - homepage endpoint will return unique error + - headers endpoint will return no error + """ + max_retries = 2 + crawler = PlaywrightCrawler( + statistics=Statistics.with_default_state(save_error_snapshots=True), max_request_retries=max_retries + ) + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + if 'page' in context.request.url: + raise RuntimeError('page error') + if 'headers' in context.request.url: + return + raise RuntimeError('home error') + + await crawler.run( + [str(server_url), str(server_url / 'page_1'), str(server_url / 'page_2'), str(server_url / 'headers')] + ) + + kvs = await crawler.get_key_value_store() + kvs_content = {} + + async for key_info in kvs.iterate_keys(): + kvs_content[key_info.key] = await kvs.get_value(key_info.key) + + assert set(key_info.key).issubset(ErrorSnapshotter.ALLOWED_CHARACTERS) + if key_info.key.endswith('.jpg'): + # Check at least jpeg start and end expected bytes. Content is not relevant for the test. + assert kvs_content[key_info.key].startswith(b'\xff\xd8') + assert kvs_content[key_info.key].endswith(b'\xff\xd9') + elif 'page' in key_info.key: + assert kvs_content[key_info.key] == GENERIC_RESPONSE.decode('utf-8') + else: + assert kvs_content[key_info.key] == HELLO_WORLD.decode('utf-8') + + # Three errors twice retried errors, but only 2 unique -> 4 (2 x (html and jpg)) artifacts expected. 
+ assert crawler.statistics.error_tracker.total == 3 * max_retries + assert crawler.statistics.error_tracker.unique_error_count == 2 + assert len(kvs_content) == 4 diff --git a/tests/unit/server.py b/tests/unit/server.py index 8a17fd3d7b..29e789d013 100644 --- a/tests/unit/server.py +++ b/tests/unit/server.py @@ -11,6 +11,8 @@ from uvicorn.server import Server from yarl import URL +from tests.unit.server_endpoints import GENERIC_RESPONSE, HELLO_WORLD, INCAPSULA, SECONDARY_INDEX, START_ENQUEUE + if TYPE_CHECKING: from socket import socket @@ -158,11 +160,7 @@ async def hello_world(send: Send) -> None: """Handle basic requests with a simple HTML response.""" await send_html_response( send, - b""" - - Hello, world! - - """, + HELLO_WORLD, ) @@ -256,15 +254,7 @@ async def start_enqueue_endpoint(send: Send) -> None: """Handle requests for the main page with links.""" await send_html_response( send, - b""" - - Hello - - - Link 1 - Link 2 - - """, + START_ENQUEUE, ) @@ -272,15 +262,7 @@ async def secondary_index_endpoint(send: Send) -> None: """Handle requests for the secondary page with links.""" await send_html_response( send, - b""" - - Hello - - - Link 3 - Link 4 - - """, + SECONDARY_INDEX, ) @@ -288,15 +270,7 @@ async def incapsula_endpoint(send: Send) -> None: """Handle requests for a page with an incapsula iframe.""" await send_html_response( send, - b""" - - Hello - - - - - """, + INCAPSULA, ) @@ -304,14 +278,7 @@ async def generic_response_endpoint(send: Send) -> None: """Handle requests with a generic HTML response.""" await send_html_response( send, - b""" - - Hello - - - Insightful content - - """, + GENERIC_RESPONSE, ) diff --git a/tests/unit/server_endpoints.py b/tests/unit/server_endpoints.py new file mode 100644 index 0000000000..00456d3dcd --- /dev/null +++ b/tests/unit/server_endpoints.py @@ -0,0 +1,43 @@ +# Test server response content for testing + +HELLO_WORLD = b"""\ + + Hello, world! + + +""" + +START_ENQUEUE = b"""\ + + Hello + + + Link 1 + Link 2 +""" + +SECONDARY_INDEX = b"""\ + + Hello + + + Link 3 + Link 4 +""" + +INCAPSULA = b"""\ + + Hello + + + +""" + +GENERIC_RESPONSE = b"""\ + + Hello + + + Insightful content +"""
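
Taken together, these changes mean that after a crawl with `save_error_snapshots=True`, the captured snapshots end up in the default key-value store. Below is a minimal sketch of reading them back after a run — it is not part of this change set and reuses the `ParselCrawler` setup from the documentation example above; the forced `RuntimeError` is only there to trigger a snapshot.

```python
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.statistics import Statistics


async def main() -> None:
    crawler = ParselCrawler(
        statistics=Statistics.with_default_state(save_error_snapshots=True)
    )

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        # Force an error so that the first occurrence produces a snapshot.
        raise RuntimeError(f'Simulated error for {context.request.url}')

    await crawler.run(['https://crawlee.dev'])

    # Error snapshots are written to the default key-value store; their keys
    # start with the `ErrorSnapshotter.SNAPSHOT_PREFIX` ('ERROR_SNAPSHOT').
    kvs = await crawler.get_key_value_store()
    async for key_info in kvs.iterate_keys():
        if key_info.key.startswith('ERROR_SNAPSHOT'):
            print(key_info.key)


if __name__ == '__main__':
    asyncio.run(main())
```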