Skip to content

Crawler's storage_client errors when using SqlStorageClient #1495

@ericvg97

Description

@ericvg97

This crawler

class Crawler:
    """Runs an AdaptivePlaywrightCrawler against a single seed URL."""

    def __init__(self, storage_client: StorageClient):
        # The storage backend is injected so callers (and tests) can choose
        # the implementation, e.g. MemoryStorageClient vs SqlStorageClient.
        self.storage_client = storage_client

    async def crawl(
        self,
        seed_url: str,
        purge_on_start: bool = False,
    ) -> None:
        """Crawl starting from *seed_url*; optionally purge storages first."""
        print(f"Crawling {seed_url} ...")

        configuration = service_locator.get_configuration()
        configuration.purge_on_start = purge_on_start
        # Workaround kept disabled: registering the client globally works but
        # makes testing tricky.
        # service_locator.set_storage_client(storage_client=self.storage_client)

        # Deliberately serial crawling, throttled to 200 tasks/minute.
        throttling = ConcurrencySettings(
            min_concurrency=1,
            desired_concurrency=1,
            max_concurrency=1,
            max_tasks_per_minute=200,
        )
        run_statistics = Statistics.with_default_state(
            log_interval=timedelta(minutes=1),
            statistics_log_format="table",
        )

        crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
            storage_client=self.storage_client,
            statistics=run_statistics,
            concurrency_settings=throttling,
            playwright_crawler_specific_kwargs={
                "browser_type": "chromium",
                "headless": True,
            },
            max_requests_per_crawl=10000,
            max_request_retries=5,
            max_session_rotations=10,
            request_handler_timeout=timedelta(seconds=30),
            retry_on_blocked=True,
            respect_robots_txt_file=True,
            use_session_pool=True,
            configure_logging=True,
            keep_alive=False,
        )

        # Wire up navigation/request handlers before starting the run.
        crawler.pre_navigation_hook(pre_navigation_hook)
        crawler.router.default_handler(default_handler)
        crawler.error_handler(error_handler)
        crawler.failed_request_handler(failed_request_handler)

        await crawler.run([seed_url])

        logger.info("Finished crawling")

breaks when the storage client is of type `SqlStorageClient`, failing with this error:

scripts/crawling/crawl_url.py:13: UserWarning: The SqlStorageClient is experimental and may change or be removed in future releases.
  sql_client = SqlStorageClient(engine=engine)
Crawling http://httpbingo.org/status/429 ...
No configuration set, implicitly creating and using default Configuration.
Traceback (most recent call last):
  File "crawl_url.py", line 21, in <module>
    tyro.cli(crawl_entrypoint)
  File ".venv/lib/python3.12/site-packages/tyro/_cli.py", line 238, in cli
    return run_with_args_from_cli()
           ^^^^^^^^^^^^^^^^^^^^^^^^
  File "scripts/crawling/crawl_url.py", line 17, in crawl_entrypoint
    asyncio.run(crawler.crawl(url, True))
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/asyncio/runners.py", line 195, in run
    return runner.run(main)
           ^^^^^^^^^^^^^^^^
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/asyncio/runners.py", line 118, in run
    return self._loop.run_until_complete(task)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/asyncio/base_events.py", line 691, in run_until_complete
    return future.result()
           ^^^^^^^^^^^^^^^
  File "crawler/main.py", line 41, in crawl
    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File ".venv/lib/python3.12/site-packages/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", line 239, in with_beautifulsoup_static_parser
    return AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[BeautifulSoup], BeautifulSoup, Tag](
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/typing.py", line 1184, in __call__
    result = self.__origin__(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File ".venv/lib/python3.12/site-packages/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", line 153, in __init__
    basic_crawler_kwargs_for_static_crawler = deepcopy(kwargs)
                                              ^^^^^^^^^^^^^^^^
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 136, in deepcopy
    y = copier(x, memo)
        ^^^^^^^^^^^^^^^
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 221, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
                             ^^^^^^^^^^^^^^^^^^^^^
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 162, in deepcopy
    y = _reconstruct(x, memo, *rv)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 259, in _reconstruct
    state = deepcopy(state, memo)
            ^^^^^^^^^^^^^^^^^^^^^
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 136, in deepcopy
    y = copier(x, memo)
        ^^^^^^^^^^^^^^^
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 221, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
                             ^^^^^^^^^^^^^^^^^^^^^
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 162, in deepcopy
    y = _reconstruct(x, memo, *rv)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 259, in _reconstruct
    state = deepcopy(state, memo)
            ^^^^^^^^^^^^^^^^^^^^^
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 136, in deepcopy
    y = copier(x, memo)
        ^^^^^^^^^^^^^^^
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 201, in _deepcopy_tuple
    y = [deepcopy(a, memo) for a in x]
         ^^^^^^^^^^^^^^^^^
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 136, in deepcopy
    y = copier(x, memo)
        ^^^^^^^^^^^^^^^
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 221, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
                             ^^^^^^^^^^^^^^^^^^^^^
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 162, in deepcopy
    y = _reconstruct(x, memo, *rv)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 259, in _reconstruct
    state = deepcopy(state, memo)
            ^^^^^^^^^^^^^^^^^^^^^
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 136, in deepcopy
    y = copier(x, memo)
        ^^^^^^^^^^^^^^^
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 221, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
                             ^^^^^^^^^^^^^^^^^^^^^
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 162, in deepcopy
    y = _reconstruct(x, memo, *rv)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 259, in _reconstruct
    state = deepcopy(state, memo)
            ^^^^^^^^^^^^^^^^^^^^^
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 136, in deepcopy
    y = copier(x, memo)
        ^^^^^^^^^^^^^^^
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 221, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
                             ^^^^^^^^^^^^^^^^^^^^^
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 162, in deepcopy
    y = _reconstruct(x, memo, *rv)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 259, in _reconstruct
    state = deepcopy(state, memo)
            ^^^^^^^^^^^^^^^^^^^^^
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 136, in deepcopy
    y = copier(x, memo)
        ^^^^^^^^^^^^^^^
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 221, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
                             ^^^^^^^^^^^^^^^^^^^^^
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 162, in deepcopy
    y = _reconstruct(x, memo, *rv)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 259, in _reconstruct
    state = deepcopy(state, memo)
            ^^^^^^^^^^^^^^^^^^^^^
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 136, in deepcopy
    y = copier(x, memo)
        ^^^^^^^^^^^^^^^
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 221, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
                             ^^^^^^^^^^^^^^^^^^^^^
  File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 151, in deepcopy
    rv = reductor(4)
         ^^^^^^^^^^^
TypeError: cannot pickle 'module' object

Everything works fine if the storage client is of type `MemoryStorageClient`.

We found a workaround by calling `service_locator.set_storage_client(storage_client=self.storage_client)`, but using this global registration makes testing quite tricky.

I am using Crawlee 1.0.2

Metadata

Metadata

Assignees

Labels

bug — Something isn't working. t-tooling — Issues with this label are in the ownership of the tooling team.

Type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions