feat: Add ErrorSnapshotter to ErrorTracker #1125

Merged 18 commits on Apr 7, 2025
27 changes: 27 additions & 0 deletions docs/examples/capturing_page_snapshots_with_error_snapshotter.mdx
@@ -0,0 +1,27 @@
---
id: capturing-page-snapshots-with-error-snapshotter
title: Capturing page snapshots with ErrorSnapshotter
description: How to capture page snapshots on errors.
---
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
import ApiLink from '@site/src/components/ApiLink';
import ParselCrawlerWithErrorSnapshotter from '!!raw-loader!roa-loader!./code_examples/parsel_crawler_with_error_snapshotter.py';
import PlaywrightCrawlerWithErrorSnapshotter from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_with_error_snapshotter.py';


This example demonstrates how to capture page snapshots on the first occurrence of each unique error. The capturing happens automatically if you set `save_error_snapshots=True` in the crawler's <ApiLink to="class/Statistics">`Statistics`</ApiLink>. The error snapshot can contain an `html` file and a `jpeg` file created from the page where the unhandled exception was raised. Captured error snapshot files are saved to the default key-value store. Both <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> and [HTTP crawlers](../guides/http-crawlers) can capture the `html` file, but only <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> can also capture a page screenshot.

<Tabs>
    <TabItem value="ParselCrawler" label="ParselCrawler">
        <RunnableCodeBlock className="language-python" language="python">
            { ParselCrawlerWithErrorSnapshotter }
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="PlaywrightCrawler" label="PlaywrightCrawler">
        <RunnableCodeBlock className="language-python" language="python">
            { PlaywrightCrawlerWithErrorSnapshotter }
        </RunnableCodeBlock>
    </TabItem>
</Tabs>
31 changes: 31 additions & 0 deletions docs/examples/code_examples/parsel_crawler_with_error_snapshotter.py
@@ -0,0 +1,31 @@
import asyncio
from random import choice

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.statistics import Statistics


async def main() -> None:
    crawler = ParselCrawler(
        statistics=Statistics.with_default_state(save_error_snapshots=True)
    )

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        # Simulate various errors to demonstrate that `ErrorSnapshotter`
        # saves only the first occurrence of each unique error.
        await context.enqueue_links()
        random_number = choice(range(10))
        if random_number == 1:
            raise KeyError('Some KeyError')
        if random_number == 2:
            raise ValueError('Some ValueError')
        if random_number == 3:
            raise RuntimeError('Some RuntimeError')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
31 changes: 31 additions & 0 deletions docs/examples/code_examples/playwright_crawler_with_error_snapshotter.py
@@ -0,0 +1,31 @@
import asyncio
from random import choice

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.statistics import Statistics


async def main() -> None:
    crawler = PlaywrightCrawler(
        statistics=Statistics.with_default_state(save_error_snapshots=True)
    )

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        # Simulate various errors to demonstrate that `ErrorSnapshotter`
        # saves only the first occurrence of each unique error.
        await context.enqueue_links()
        random_number = choice(range(10))
        if random_number == 1:
            raise KeyError('Some KeyError')
        if random_number == 2:
            raise ValueError('Some ValueError')
        if random_number == 3:
            raise RuntimeError('Some RuntimeError')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
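After a crawl that produced errors, the captured snapshot files can be inspected in the default key-value store. Below is a minimal sketch of reading them back; the exact key names used for snapshots are not shown in this PR excerpt, so the `'ERROR'` filter is only an illustrative assumption.

```python
import asyncio

from crawlee.storages import KeyValueStore


async def main() -> None:
    # Error snapshots are saved to the default key-value store.
    kvs = await KeyValueStore.open()

    # List stored keys and pick out the ones that look like error snapshots.
    # The 'ERROR' substring check is an assumption for illustration only;
    # inspect your store to see the actual key names.
    async for key_info in kvs.iterate_keys():
        if 'ERROR' in key_info.key.upper():
            print(key_info.key)


if __name__ == '__main__':
    asyncio.run(main())
```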
4 changes: 2 additions & 2 deletions docs/examples/crawl_specific_links_on_website.mdx
@@ -35,12 +35,12 @@ This example demonstrates how to crawl a website while targeting specific patter

<Tabs groupId="second-example">
<TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler">
<RunnableCodeBlock className="language-python">
<RunnableCodeBlock className="language-python" language="python">
{BeautifulSoupExampleExtractAndAdd}
</RunnableCodeBlock>
</TabItem>
<TabItem value="PlaywrightCrawler" label="PlaywrightCrawler">
<RunnableCodeBlock className="language-python">
<RunnableCodeBlock className="language-python" language="python">
{PlaywrightExampleExtractAndAdd}
</RunnableCodeBlock>
</TabItem>
5 changes: 5 additions & 0 deletions pyproject.toml
@@ -179,6 +179,11 @@ indent-style = "space"
    "F841", # Local variable {variable} is assigned to but never used
    "N999", # Invalid module name
]
"**/docs/examples/code_examples/*crawler_with_error_snapshotter.py" = [
    "PLR2004", # Magic value used in comparison. Ignored for simplicity and readability of example code.
]



[tool.ruff.lint.flake8-quotes]
docstring-quotes = "double"
32 changes: 32 additions & 0 deletions src/crawlee/_types.py
@@ -1,5 +1,6 @@
from __future__ import annotations

import dataclasses
from collections.abc import Iterator, Mapping
from dataclasses import dataclass
from enum import Enum
@@ -559,6 +560,33 @@ def __call__(
        """


@docs_group('Data structures')
@dataclasses.dataclass
class PageSnapshot:
    """Snapshot of a crawled page."""

    screenshot: bytes | None = None
    """Screenshot of the page."""

    html: str | None = None
    """HTML content of the page."""

    def __bool__(self) -> bool:
        return bool(self.screenshot or self.html)


@docs_group('Functions')
class GetPageSnapshot(Protocol):
    """A function for getting a snapshot of a page."""

    def __call__(self) -> Coroutine[None, None, PageSnapshot]:
        """Get page snapshot.

        Returns:
            Snapshot of a page.
        """


@docs_group('Functions')
class UseStateFunction(Protocol):
    """A function for managing state within the crawling context.
@@ -619,6 +647,10 @@ class BasicCrawlingContext:
    log: logging.Logger
    """Logger instance."""

    async def get_snapshot(self) -> PageSnapshot:
        """Get snapshot of crawled page."""
        return PageSnapshot()

    def __hash__(self) -> int:
        """Return hash of the context. Each context is considered unique."""
        return id(self)
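A small illustration of what the `__bool__` override buys callers: an empty snapshot, such as the default one returned by `BasicCrawlingContext.get_snapshot`, is falsy, so code can cheaply skip persisting it. The import path below is the private module shown in this diff.

```python
from crawlee._types import PageSnapshot

# The base BasicCrawlingContext returns an empty snapshot, which is falsy.
assert not PageSnapshot()

# Any populated field makes the snapshot truthy, so callers can guard persistence with `if snapshot:`.
assert PageSnapshot(html='<html><body>Hello</body></html>')
assert PageSnapshot(screenshot=b'\xff\xd8\xff')  # placeholder bytes standing in for a real JPEG
```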
@@ -5,7 +5,7 @@

from typing_extensions import Self, TypeVar

from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction, ExtractLinksFunction
from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction, ExtractLinksFunction, PageSnapshot
from crawlee._utils.docs import docs_group
from crawlee.http_clients import HttpCrawlingResult, HttpResponse

@@ -24,6 +24,10 @@ def from_basic_crawling_context(cls, context: BasicCrawlingContext, http_respons
        context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)}
        return cls(http_response=http_response, **context_kwargs)

    async def get_snapshot(self) -> PageSnapshot:
        """Get snapshot of crawled page."""
        return PageSnapshot(html=self.http_response.read().decode('utf-8'))


@dataclass(frozen=True)
@docs_group('Data structures')
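Since HTTP-based contexts build the snapshot from the response body that was already fetched, calling it from a request handler costs no extra network round trip. A usage sketch, assuming `ParselCrawlingContext` inherits the new `get_snapshot` from the HTTP crawling context above:

```python
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


async def main() -> None:
    crawler = ParselCrawler()

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        # The snapshot contains the decoded response body; HTTP crawlers produce no screenshot.
        snapshot = await context.get_snapshot()
        if snapshot.html:
            context.log.info(f'Captured {len(snapshot.html)} characters of HTML from {context.request.url}')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```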
8 changes: 4 additions & 4 deletions src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -892,7 +892,7 @@ async def _handle_request_retries(

        if self._should_retry_request(context, error):
            request.retry_count += 1
            self._statistics.error_tracker.add(error)
            await self._statistics.error_tracker.add(error=error, context=context)

            if self._error_handler:
                try:
@@ -946,7 +946,7 @@ async def _handle_request_error(self, context: TCrawlingContext | BasicCrawlingC

    async def _handle_failed_request(self, context: TCrawlingContext | BasicCrawlingContext, error: Exception) -> None:
        self._logger.exception('Request failed and reached maximum retries', exc_info=error)
        self._statistics.error_tracker.add(error)
        await self._statistics.error_tracker.add(error=error, context=context)

        if self._failed_request_handler:
            try:
@@ -1162,7 +1162,7 @@ async def __run_task_function(self) -> None:
                context.request.session_rotation_count += 1

                await request_manager.reclaim_request(request)
                self._statistics.error_tracker_retry.add(session_error)
                await self._statistics.error_tracker_retry.add(error=session_error, context=context)
            else:
                self._logger.exception('Request failed and reached maximum retries', exc_info=session_error)

@@ -1176,7 +1176,7 @@
                )

                self._statistics.record_request_processing_failure(statistics_id)
                self._statistics.error_tracker.add(session_error)
                await self._statistics.error_tracker.add(error=session_error, context=context)

        except ContextPipelineInterruptedError as interrupted_error:
            self._logger.debug('The context pipeline was interrupted', exc_info=interrupted_error)
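The `ErrorTracker.add` call is now asynchronous and receives the crawling context, which is what lets the new `ErrorSnapshotter` ask the context for a page snapshot. The snapshotter itself is outside this excerpt; the sketch below is only a simplified illustration of the idea (class name and internals are not the PR's actual implementation): only the first occurrence of each unique error triggers a snapshot.

```python
from __future__ import annotations

from crawlee._types import BasicCrawlingContext


class SimplifiedErrorTracker:
    """Illustrative stand-in for the real ErrorTracker + ErrorSnapshotter pair."""

    def __init__(self) -> None:
        self._seen_error_groups: set[str] = set()

    async def add(self, *, error: Exception, context: BasicCrawlingContext, early: bool = False) -> None:
        # Group errors by type and message; only the first occurrence gets a snapshot.
        error_group = f'{type(error).__name__}: {error}'
        if error_group in self._seen_error_groups:
            return
        self._seen_error_groups.add(error_group)

        # `early=True` is passed by crawlers that must capture the page before it is
        # torn down (see the Playwright crawler below). The snapshot comes from the context.
        snapshot = await context.get_snapshot()
        if snapshot:
            ...  # persist snapshot.html / snapshot.screenshot to the default key-value store
```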
20 changes: 13 additions & 7 deletions src/crawlee/crawlers/_basic/_context_pipeline.py
@@ -1,7 +1,6 @@
from __future__ import annotations

from collections.abc import AsyncGenerator, Awaitable, Generator
from typing import Any, Callable, Generic, cast
from typing import TYPE_CHECKING, Any, Callable, Generic, cast

from typing_extensions import TypeVar

@@ -15,6 +14,9 @@
    SessionError,
)

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator, Awaitable, Generator

TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
TMiddlewareCrawlingContext = TypeVar('TMiddlewareCrawlingContext', bound=BasicCrawlingContext)

@@ -31,7 +33,7 @@ def __init__(
        *,
        _middleware: Callable[
            [TCrawlingContext],
            AsyncGenerator[TMiddlewareCrawlingContext, None],
            AsyncGenerator[TMiddlewareCrawlingContext, Exception | None],
        ]
        | None = None,
        _parent: ContextPipeline[BasicCrawlingContext] | None = None,
@@ -55,7 +57,8 @@ async def __call__(
        Exceptions from the consumer function are wrapped together with the final crawling context.
        """
        chain = list(self._middleware_chain())
        cleanup_stack = list[AsyncGenerator]()
        cleanup_stack: list[AsyncGenerator[Any, Exception | None]] = []
        final_consumer_exception: Exception | None = None

        try:
            for member in reversed(chain):
@@ -77,14 +80,16 @@

            try:
                await final_context_consumer(cast('TCrawlingContext', crawling_context))
            except SessionError:  # Session errors get special treatment
            except SessionError as e:  # Session errors get special treatment
                final_consumer_exception = e
                raise
            except Exception as e:
                final_consumer_exception = e
                raise RequestHandlerError(e, crawling_context) from e
        finally:
            for middleware_instance in reversed(cleanup_stack):
                try:
                    result = await middleware_instance.__anext__()
                    result = await middleware_instance.asend(final_consumer_exception)
                except StopAsyncIteration:  # noqa: PERF203
                    pass
                except ContextPipelineInterruptedError as e:
@@ -111,7 +116,8 @@ def compose(
        """
        return ContextPipeline[TMiddlewareCrawlingContext](
            _middleware=cast(
                'Callable[[BasicCrawlingContext], AsyncGenerator[TMiddlewareCrawlingContext, None]]', middleware
                'Callable[[BasicCrawlingContext], AsyncGenerator[TMiddlewareCrawlingContext, Exception | None]]',
                middleware,
            ),
            _parent=cast('ContextPipeline[BasicCrawlingContext]', self),
        )
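The switch from `__anext__()` to `asend(final_consumer_exception)` means each middleware generator now receives the consumer's exception (or `None`) at its `yield` point. A hedged sketch of a middleware written against this new contract:

```python
from __future__ import annotations

from collections.abc import AsyncGenerator

from crawlee._types import BasicCrawlingContext


async def example_middleware(
    context: BasicCrawlingContext,
) -> AsyncGenerator[BasicCrawlingContext, Exception | None]:
    # Set up resources needed by downstream middleware and the request handler here.
    error = yield context  # None on success, otherwise the consumer's exception.
    if error is not None:
        # The failure is observable while resources are still alive,
        # e.g. a page snapshot can be captured before the page is closed.
        ...
    # Tear down resources here.
```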
10 changes: 7 additions & 3 deletions src/crawlee/crawlers/_playwright/_playwright_crawler.py
@@ -202,7 +202,7 @@ async def _open_page(
    async def _navigate(
        self,
        context: PlaywrightPreNavCrawlingContext,
    ) -> AsyncGenerator[PlaywrightCrawlingContext, None]:
    ) -> AsyncGenerator[PlaywrightCrawlingContext, Exception | None]:
        """Execute an HTTP request utilizing the `BrowserPool` and the `Playwright` library.

        Args:
@@ -238,7 +238,7 @@ async def _navigate(

        extract_links = self._create_extract_links_function(context)

        yield PlaywrightCrawlingContext(
        error = yield PlaywrightCrawlingContext(
            request=context.request,
            session=context.session,
            add_requests=context.add_requests,
@@ -251,11 +251,15 @@
            page=context.page,
            infinite_scroll=lambda: infinite_scroll(context.page),
            response=response,
            enqueue_links=self._create_enqueue_links_function(context, extract_links),
            extract_links=extract_links,
            enqueue_links=self._create_enqueue_links_function(context, extract_links),
            block_requests=partial(block_requests, page=context.page),
        )

        # Collect data in case of errors, before the page object is closed.
        if error:
            await self.statistics.error_tracker.add(error=error, context=context, early=True)

    def _create_extract_links_function(self, context: PlaywrightPreNavCrawlingContext) -> ExtractLinksFunction:
        """Create a callback function for extracting links from context.

@@ -3,7 +3,7 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING

from crawlee._types import BasicCrawlingContext
from crawlee._types import BasicCrawlingContext, PageSnapshot
from crawlee._utils.docs import docs_group

if TYPE_CHECKING:
@@ -25,3 +25,20 @@ class PlaywrightPreNavCrawlingContext(BasicCrawlingContext):

    block_requests: BlockRequestsFunction
    """Blocks network requests matching specified URL patterns."""

    async def get_snapshot(self) -> PageSnapshot:
        """Get snapshot of crawled page."""
        html = None
        screenshot = None

        try:
            html = await self.page.content()
        except Exception:
            self.log.exception(f'Failed to get html snapshot for {self.request.url}.')

        try:
            screenshot = await self.page.screenshot(full_page=True, type='jpeg')
        except Exception:
            self.log.exception(f'Failed to get page screenshot for {self.request.url}.')

        return PageSnapshot(html=html, screenshot=screenshot)
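Beyond the automatic capture on errors, the same method can be called on demand from a request handler. A hedged sketch, assuming `PlaywrightCrawlingContext` inherits `get_snapshot` from this pre-navigation context; the key name below is arbitrary:

```python
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.storages import KeyValueStore


async def main() -> None:
    crawler = PlaywrightCrawler()

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        # Capture the live page: HTML content plus a full-page JPEG screenshot.
        snapshot = await context.get_snapshot()
        if snapshot.screenshot:
            kvs = await KeyValueStore.open()
            await kvs.set_value('manual-snapshot.jpeg', snapshot.screenshot, content_type='image/jpeg')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```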