Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ keywords = [
"scraping",
]
dependencies = [
"apify-client>=2.0.0,<3.0.0",
"apify-client>=2.2.0,<3.0.0",
"apify-shared>=2.0.0,<3.0.0",
"crawlee>=1.0.2,<2.0.0",
"cachetools>=5.5.0",
Expand Down
41 changes: 12 additions & 29 deletions src/apify/storage_clients/_apify/_dataset_client.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
from __future__ import annotations

import asyncio
import warnings
from logging import getLogger
from typing import TYPE_CHECKING, Any

from typing_extensions import override

from apify_client import ApifyClientAsync
from crawlee._utils.byte_size import ByteSize
from crawlee._utils.file import json_dumps
from crawlee.storage_clients._base import DatasetClient
from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
from crawlee.storages import Dataset

from ._utils import AliasResolver
from ._utils import AliasResolver, create_apify_client

if TYPE_CHECKING:
from collections.abc import AsyncIterator
Expand Down Expand Up @@ -52,12 +52,17 @@ def __init__(
self._api_client = api_client
"""The Apify dataset client for API operations."""

self._api_public_base_url = api_public_base_url
"""The public base URL for accessing the key-value store records."""

self._lock = lock
"""A lock to ensure that only one operation is performed at a time."""

if api_public_base_url:
# Remove in version 4.0, https://github.com/apify/apify-sdk-python/issues/635
warnings.warn(
'api_public_base_url argument is deprecated and will be removed in version 4.0.0',
DeprecationWarning,
stacklevel=2,
)

@override
async def get_metadata(self) -> DatasetMetadata:
metadata = await self._api_client.get()
Expand Down Expand Up @@ -99,29 +104,7 @@ async def open(
if sum(1 for param in [id, name, alias] if param is not None) > 1:
raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')

token = configuration.token
if not token:
raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).')

api_url = configuration.api_base_url
if not api_url:
raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).')

api_public_base_url = configuration.api_public_base_url
if not api_public_base_url:
raise ValueError(
'Apify storage client requires a valid API public base URL in Configuration '
f'(api_public_base_url={api_public_base_url}).'
)

# Create Apify client with the provided token and API URL.
apify_client_async = ApifyClientAsync(
token=token,
api_url=api_url,
max_retries=8,
min_delay_between_retries_millis=500,
timeout_secs=360,
)
apify_client_async = create_apify_client(configuration)
apify_datasets_client = apify_client_async.datasets()

# Normalize unnamed default storage in cases where not defined in `configuration.default_dataset_id` to unnamed
Expand Down Expand Up @@ -178,7 +161,7 @@ async def open(

return cls(
api_client=apify_dataset_client,
api_public_base_url=api_public_base_url,
api_public_base_url='', # Remove in version 4.0, https://github.com/apify/apify-sdk-python/issues/635
lock=asyncio.Lock(),
)

Expand Down
56 changes: 13 additions & 43 deletions src/apify/storage_clients/_apify/_key_value_store_client.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,18 @@
from __future__ import annotations

import asyncio
import warnings
from logging import getLogger
from typing import TYPE_CHECKING, Any

from typing_extensions import override
from yarl import URL

from apify_client import ApifyClientAsync
from crawlee.storage_clients._base import KeyValueStoreClient
from crawlee.storage_clients.models import KeyValueStoreRecord, KeyValueStoreRecordMetadata
from crawlee.storages import KeyValueStore

from ._models import ApifyKeyValueStoreMetadata, KeyValueStoreListKeysPage
from ._utils import AliasResolver
from apify._crypto import create_hmac_signature
from ._utils import AliasResolver, create_apify_client

if TYPE_CHECKING:
from collections.abc import AsyncIterator
Expand Down Expand Up @@ -43,12 +41,17 @@ def __init__(
self._api_client = api_client
"""The Apify KVS client for API operations."""

self._api_public_base_url = api_public_base_url
"""The public base URL for accessing the key-value store records."""

self._lock = lock
"""A lock to ensure that only one operation is performed at a time."""

if api_public_base_url:
# Remove in version 4.0, https://github.com/apify/apify-sdk-python/issues/635
warnings.warn(
'api_public_base_url argument is deprecated and will be removed in version 4.0.0',
DeprecationWarning,
stacklevel=2,
)

@override
async def get_metadata(self) -> ApifyKeyValueStoreMetadata:
metadata = await self._api_client.get()
Expand Down Expand Up @@ -90,29 +93,7 @@ async def open(
if sum(1 for param in [id, name, alias] if param is not None) > 1:
raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')

token = configuration.token
if not token:
raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).')

api_url = configuration.api_base_url
if not api_url:
raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).')

api_public_base_url = configuration.api_public_base_url
if not api_public_base_url:
raise ValueError(
'Apify storage client requires a valid API public base URL in Configuration '
f'(api_public_base_url={api_public_base_url}).'
)

# Create Apify client with the provided token and API URL.
apify_client_async = ApifyClientAsync(
token=token,
api_url=api_url,
max_retries=8,
min_delay_between_retries_millis=500,
timeout_secs=360,
)
apify_client_async = create_apify_client(configuration)
apify_kvss_client = apify_client_async.key_value_stores()

# Normalize unnamed default storage in cases where not defined in `configuration.default_key_value_store_id` to
Expand Down Expand Up @@ -170,7 +151,7 @@ async def open(

return cls(
api_client=apify_kvs_client,
api_public_base_url=api_public_base_url,
api_public_base_url='', # Remove in version 4.0, https://github.com/apify/apify-sdk-python/issues/635
lock=asyncio.Lock(),
)

Expand Down Expand Up @@ -251,15 +232,4 @@ async def get_public_url(self, key: str) -> str:
Returns:
A public URL that can be used to access the value of the given key in the KVS.
"""
if self._api_client.resource_id is None:
raise ValueError('resource_id cannot be None when generating a public URL')

public_url = (
URL(self._api_public_base_url) / 'v2' / 'key-value-stores' / self._api_client.resource_id / 'records' / key
)
metadata = await self.get_metadata()

if metadata.url_signing_secret_key is not None:
public_url = public_url.with_query(signature=create_hmac_signature(metadata.url_signing_secret_key, key))

return str(public_url)
return await self._api_client.get_record_public_url(key=key)
27 changes: 2 additions & 25 deletions src/apify/storage_clients/_apify/_request_queue_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

from typing_extensions import override

from apify_client import ApifyClientAsync
from crawlee._utils.crypto import crypto_random_object_id
from crawlee.storage_clients._base import RequestQueueClient
from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata
Expand All @@ -14,7 +13,7 @@
from ._models import ApifyRequestQueueMetadata, RequestQueueStats
from ._request_queue_shared_client import ApifyRequestQueueSharedClient
from ._request_queue_single_client import ApifyRequestQueueSingleClient
from ._utils import AliasResolver
from ._utils import AliasResolver, create_apify_client

if TYPE_CHECKING:
from collections.abc import Sequence
Expand Down Expand Up @@ -228,29 +227,7 @@ async def open(
if sum(1 for param in [id, name, alias] if param is not None) > 1:
raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')

token = configuration.token
if not token:
raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).')

api_url = configuration.api_base_url
if not api_url:
raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).')

api_public_base_url = configuration.api_public_base_url
if not api_public_base_url:
raise ValueError(
'Apify storage client requires a valid API public base URL in Configuration '
f'(api_public_base_url={api_public_base_url}).'
)

# Create Apify client with the provided token and API URL.
apify_client_async = ApifyClientAsync(
token=token,
api_url=api_url,
max_retries=8,
min_delay_between_retries_millis=500,
timeout_secs=360,
)
apify_client_async = create_apify_client(configuration)
apify_rqs_client = apify_client_async.request_queues()

# Normalize unnamed default storage in cases where not defined in `configuration.default_request_queue_id` to
Expand Down
27 changes: 27 additions & 0 deletions src/apify/storage_clients/_apify/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,3 +192,30 @@ def unique_key_to_request_id(unique_key: str, *, request_id_length: int = 15) ->

# Truncate the key to the desired length
return url_safe_key[:request_id_length]


def create_apify_client(
    configuration: Configuration,
    *,
    max_retries: int = 8,
    min_delay_between_retries_millis: int = 500,
    timeout_secs: int = 360,
) -> ApifyClientAsync:
    """Create an `ApifyClientAsync` instance from the provided configuration.

    The token, API base URL and API public base URL are read from `configuration` and validated before
    the client is constructed, so a misconfigured environment fails early with a descriptive error.

    Args:
        configuration: The configuration supplying `token`, `api_base_url` and `api_public_base_url`.
        max_retries: Forwarded to `ApifyClientAsync` as `max_retries`.
        min_delay_between_retries_millis: Forwarded to `ApifyClientAsync` as
            `min_delay_between_retries_millis`.
        timeout_secs: Forwarded to `ApifyClientAsync` as `timeout_secs`.

    Returns:
        A new `ApifyClientAsync` built from the validated configuration values.

    Raises:
        ValueError: If the token, the API base URL, or the API public base URL is missing or empty.
    """
    # Read each value once so the value that is validated is the value that is passed to the client.
    token = configuration.token
    if not token:
        raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).')

    api_url = configuration.api_base_url
    if not api_url:
        raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).')

    api_public_base_url = configuration.api_public_base_url
    if not api_public_base_url:
        raise ValueError(
            'Apify storage client requires a valid API public base URL in Configuration '
            f'(api_public_base_url={api_public_base_url}).'
        )

    # Create Apify client with the provided token and API URLs.
    return ApifyClientAsync(
        token=token,
        api_url=api_url,
        api_public_url=api_public_base_url,
        max_retries=max_retries,
        min_delay_between_retries_millis=min_delay_between_retries_millis,
        timeout_secs=timeout_secs,
    )
8 changes: 4 additions & 4 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.