-
Notifications
You must be signed in to change notification settings - Fork 506
Labels
bug: Something isn't working. t-tooling: Issues with this label are in the ownership of the tooling team.
Description
The following crawler:
class Crawler:
    """Thin wrapper that runs an adaptive Playwright crawl with a caller-supplied storage client."""

    def __init__(self, storage_client: StorageClient):
        """Keep the storage client so that ``crawl`` can hand it to the crawler it builds."""
        self.storage_client = storage_client

    async def crawl(
        self,
        seed_url: str,
        purge_on_start: bool = False,
    ) -> None:
        """Build an ``AdaptivePlaywrightCrawler`` and run it on a single seed URL.

        Args:
            seed_url: The only URL passed to ``crawler.run``.
            purge_on_start: Value assigned to ``purge_on_start`` on the
                configuration object returned by
                ``service_locator.get_configuration()``.
        """
        print(f"Crawling {seed_url} ...")

        # The flag is written onto the configuration object obtained from the
        # service locator, not onto a configuration created here.
        service_locator.get_configuration().purge_on_start = purge_on_start

        # Alternative wiring, intentionally left disabled:
        # service_locator.set_storage_client(storage_client=self.storage_client)

        # min, max and desired concurrency are all pinned to 1.
        single_task_concurrency = ConcurrencySettings(
            min_concurrency=1,
            max_concurrency=1,
            desired_concurrency=1,
            max_tasks_per_minute=200,
        )
        crawl_statistics = Statistics.with_default_state(
            log_interval=timedelta(minutes=1),
            statistics_log_format="table",
        )
        browser_options = {
            "browser_type": "chromium",
            "headless": True,
        }

        adaptive_crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
            max_requests_per_crawl=10000,
            playwright_crawler_specific_kwargs=browser_options,
            max_session_rotations=10,
            retry_on_blocked=True,
            concurrency_settings=single_task_concurrency,
            statistics=crawl_statistics,
            configure_logging=True,
            use_session_pool=True,
            max_request_retries=5,
            request_handler_timeout=timedelta(seconds=30),
            keep_alive=False,
            respect_robots_txt_file=True,
            # The storage client is passed explicitly rather than registered
            # through the service locator.
            storage_client=self.storage_client,
        )

        # Register the externally defined callbacks before starting the run.
        adaptive_crawler.pre_navigation_hook(pre_navigation_hook)
        adaptive_crawler.router.default_handler(default_handler)
        adaptive_crawler.error_handler(error_handler)
        adaptive_crawler.failed_request_handler(failed_request_handler)

        await adaptive_crawler.run([seed_url])
        logger.info("Finished crawling")
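breaks when the storage client is of type SqlStorageClient. The failure happens while AdaptivePlaywrightCrawler.__init__ deep-copies its keyword arguments, and produces the following error: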
scripts/crawling/crawl_url.py:13: UserWarning: The SqlStorageClient is experimental and may change or be removed in future releases.
sql_client = SqlStorageClient(engine=engine)
Crawling http://httpbingo.org/status/429 ...
No configuration set, implicitly creating and using default Configuration.
Traceback (most recent call last):
File "crawl_url.py", line 21, in <module>
tyro.cli(crawl_entrypoint)
File ".venv/lib/python3.12/site-packages/tyro/_cli.py", line 238, in cli
return run_with_args_from_cli()
^^^^^^^^^^^^^^^^^^^^^^^^
File "scripts/crawling/crawl_url.py", line 17, in crawl_entrypoint
asyncio.run(crawler.crawl(url, True))
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/asyncio/runners.py", line 195, in run
return runner.run(main)
^^^^^^^^^^^^^^^^
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/asyncio/runners.py", line 118, in run
return self._loop.run_until_complete(task)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/asyncio/base_events.py", line 691, in run_until_complete
return future.result()
^^^^^^^^^^^^^^^
File "crawler/main.py", line 41, in crawl
crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File ".venv/lib/python3.12/site-packages/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", line 239, in with_beautifulsoup_static_parser
return AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[BeautifulSoup], BeautifulSoup, Tag](
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/typing.py", line 1184, in __call__
result = self.__origin__(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File ".venv/lib/python3.12/site-packages/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py", line 153, in __init__
basic_crawler_kwargs_for_static_crawler = deepcopy(kwargs)
^^^^^^^^^^^^^^^^
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 136, in deepcopy
y = copier(x, memo)
^^^^^^^^^^^^^^^
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 221, in _deepcopy_dict
y[deepcopy(key, memo)] = deepcopy(value, memo)
^^^^^^^^^^^^^^^^^^^^^
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 162, in deepcopy
y = _reconstruct(x, memo, *rv)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 259, in _reconstruct
state = deepcopy(state, memo)
^^^^^^^^^^^^^^^^^^^^^
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 136, in deepcopy
y = copier(x, memo)
^^^^^^^^^^^^^^^
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 221, in _deepcopy_dict
y[deepcopy(key, memo)] = deepcopy(value, memo)
^^^^^^^^^^^^^^^^^^^^^
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 162, in deepcopy
y = _reconstruct(x, memo, *rv)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 259, in _reconstruct
state = deepcopy(state, memo)
^^^^^^^^^^^^^^^^^^^^^
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 136, in deepcopy
y = copier(x, memo)
^^^^^^^^^^^^^^^
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 201, in _deepcopy_tuple
y = [deepcopy(a, memo) for a in x]
^^^^^^^^^^^^^^^^^
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 136, in deepcopy
y = copier(x, memo)
^^^^^^^^^^^^^^^
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 221, in _deepcopy_dict
y[deepcopy(key, memo)] = deepcopy(value, memo)
^^^^^^^^^^^^^^^^^^^^^
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 162, in deepcopy
y = _reconstruct(x, memo, *rv)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 259, in _reconstruct
state = deepcopy(state, memo)
^^^^^^^^^^^^^^^^^^^^^
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 136, in deepcopy
y = copier(x, memo)
^^^^^^^^^^^^^^^
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 221, in _deepcopy_dict
y[deepcopy(key, memo)] = deepcopy(value, memo)
^^^^^^^^^^^^^^^^^^^^^
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 162, in deepcopy
y = _reconstruct(x, memo, *rv)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 259, in _reconstruct
state = deepcopy(state, memo)
^^^^^^^^^^^^^^^^^^^^^
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 136, in deepcopy
y = copier(x, memo)
^^^^^^^^^^^^^^^
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 221, in _deepcopy_dict
y[deepcopy(key, memo)] = deepcopy(value, memo)
^^^^^^^^^^^^^^^^^^^^^
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 162, in deepcopy
y = _reconstruct(x, memo, *rv)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 259, in _reconstruct
state = deepcopy(state, memo)
^^^^^^^^^^^^^^^^^^^^^
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 136, in deepcopy
y = copier(x, memo)
^^^^^^^^^^^^^^^
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 221, in _deepcopy_dict
y[deepcopy(key, memo)] = deepcopy(value, memo)
^^^^^^^^^^^^^^^^^^^^^
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 162, in deepcopy
y = _reconstruct(x, memo, *rv)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 259, in _reconstruct
state = deepcopy(state, memo)
^^^^^^^^^^^^^^^^^^^^^
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 136, in deepcopy
y = copier(x, memo)
^^^^^^^^^^^^^^^
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 221, in _deepcopy_dict
y[deepcopy(key, memo)] = deepcopy(value, memo)
^^^^^^^^^^^^^^^^^^^^^
File "-/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/copy.py", line 151, in deepcopy
rv = reductor(4)
^^^^^^^^^^^
TypeError: cannot pickle 'module' object
Everything works fine if the storage_client is of type MemoryStorageClient.
We found a workaround: calling service_locator.set_storage_client(storage_client=self.storage_client) instead of passing the storage client to the crawler. However, this makes testing quite tricky.
I am using Crawlee 1.0.2.
Metadata
Metadata
Assignees
Labels
bug: Something isn't working. t-tooling: Issues with this label are in the ownership of the tooling team.