diff --git a/internetarchive/__init__.py b/internetarchive/__init__.py index 3a1e9df3..8936a875 100644 --- a/internetarchive/__init__.py +++ b/internetarchive/__init__.py @@ -70,8 +70,8 @@ 'configure', 'delete', 'download', - 'get_files', # API. + 'get_files', 'get_item', 'get_session', 'get_tasks', diff --git a/internetarchive/api.py b/internetarchive/api.py index f97d5d41..0c2bdacc 100644 --- a/internetarchive/api.py +++ b/internetarchive/api.py @@ -31,11 +31,13 @@ from typing import Iterable, Mapping, MutableMapping import requests +from requests import PreparedRequest, Request, Response from urllib3 import Retry from internetarchive import auth, catalog, files, item, search, session from internetarchive import config as config_module from internetarchive.exceptions import AuthenticationError +from internetarchive.files import File def get_session( @@ -105,7 +107,7 @@ def get_item( :py:class:`requests.adapters.HTTPAdapter` takes. :param request_kwargs: Keyword arguments that - :py:class:`requests.Request` takes. + :py:class:`Request` takes. :returns: The Item that fits the criteria. @@ -119,16 +121,17 @@ def get_item( archive_session = get_session(config, config_file, debug, http_adapter_kwargs) return archive_session.get_item(identifier, request_kwargs=request_kwargs) - +# mypy is confused +# mypy: disable-error-code="return-value" def get_files( - identifier: str, - files: files.File | list[files.File] | None = None, - formats: str | list[str] | None = None, - glob_pattern: str | None = None, - exclude_pattern: str | None = None, - on_the_fly: bool = False, - **get_item_kwargs, -) -> list[files.File]: + identifier: str, + files: files.File | list[files.File] | None = None, + formats: str | list[str] | None = None, + glob_pattern: str | None = None, + exclude_pattern: str | None = None, + on_the_fly: bool = False, + **get_item_kwargs, + ) -> list[File]: r"""Get :class:`File` objects from an item. :param identifier: The globally unique Archive.org identifier for a given item. @@ -157,7 +160,6 @@ def get_files( item = get_item(identifier, **get_item_kwargs) return item.get_files(files, formats, glob_pattern, exclude_pattern, on_the_fly) - def modify_metadata( identifier: str, metadata: Mapping, @@ -168,9 +170,9 @@ def modify_metadata( access_key: str | None = None, secret_key: str | None = None, debug: bool = False, - request_kwargs: Mapping | None = None, + request_kwargs: dict | None = None, **get_item_kwargs, -) -> requests.Request | requests.Response: +) -> PreparedRequest | Response | Request : r"""Modify the metadata of an existing item on Archive.org. :param identifier: The globally unique Archive.org identifier for a given item. @@ -191,7 +193,7 @@ def modify_metadata( :param secret_key: IA-S3 secret_key to use when making the given request. - :param debug: set to True to return a :class:`requests.Request ` + :param debug: set to True to return a :class:`Request ` object instead of sending request. Defaults to ``False``. :param \*\*get_item_kwargs: Arguments that ``get_item`` takes. @@ -214,24 +216,24 @@ def modify_metadata( def upload( - identifier: str, - files, - metadata: Mapping | None = None, - headers: dict | None = None, - access_key: str | None = None, - secret_key: str | None = None, - queue_derive=None, - verbose: bool = False, - verify: bool = False, - checksum: bool = False, - delete: bool = False, - retries: int | None = None, - retries_sleep: int | None = None, - debug: bool = False, - validate_identifier: bool = False, - request_kwargs: dict | None = None, - **get_item_kwargs, -) -> list[requests.Request | requests.Response]: + identifier: str, + files, + metadata: Mapping | None = None, + headers: dict | None = None, + access_key: str | None = None, + secret_key: str | None = None, + queue_derive=None, + verbose: bool = False, + verify: bool = False, + checksum: bool = False, + delete: bool = False, + retries: int | None = None, + retries_sleep: int | None = None, + debug: bool = False, + validate_identifier: bool = False, + request_kwargs: dict | None = None, + **get_item_kwargs, + ) -> list[PreparedRequest | Response | Request]: r"""Upload files to an item. The item will be created if it does not exist. :param identifier: The globally unique Archive.org identifier for a given item. @@ -297,26 +299,26 @@ def upload( def download( - identifier: str, - files: files.File | list[files.File] | None = None, - formats: str | list[str] | None = None, - glob_pattern: str | None = None, - dry_run: bool = False, - verbose: bool = False, - ignore_existing: bool = False, - checksum: bool = False, - checksum_archive: bool = False, - destdir: str | None = None, - no_directory: bool = False, - retries: int | None = None, - item_index: int | None = None, - ignore_errors: bool = False, - on_the_fly: bool = False, - return_responses: bool = False, - no_change_timestamp: bool = False, - timeout: float | tuple[int, float] | None = None, - **get_item_kwargs, -) -> list[requests.Request | requests.Response]: + identifier: str, + files: files.File | list[files.File] | None = None, + formats: str | list[str] | None = None, + glob_pattern: str | None = None, + dry_run: bool = False, + verbose: bool = False, + ignore_existing: bool = False, + checksum: bool = False, + checksum_archive: bool = False, + destdir: str | None = None, + no_directory: bool = False, + retries: int | None = None, + item_index: int | None = None, + ignore_errors: bool = False, + on_the_fly: bool = False, + return_responses: bool = False, + no_change_timestamp: bool = False, + timeout: float | tuple[int, float] | None = None, + **get_item_kwargs, + ) -> list[Response | Request]: r"""Download files from an item. :param identifier: The globally unique Archive.org identifier for a given item. @@ -398,7 +400,7 @@ def delete( verbose: bool = False, debug: bool = False, **kwargs, -) -> list[requests.Request | requests.Response]: +) -> list[Response | Request]: """Delete files from an item. Note: Some system files, such as _meta.xml, cannot be deleted. @@ -466,16 +468,16 @@ def get_tasks( def search_items( query: str, - fields: Iterable | None = None, + fields: list | None = None, sorts=None, - params: Mapping | None = None, + params: dict | None = None, full_text_search: bool = False, dsl_fts: bool = False, archive_session: session.ArchiveSession | None = None, config: Mapping | None = None, config_file: str | None = None, http_adapter_kwargs: MutableMapping | None = None, - request_kwargs: Mapping | None = None, + request_kwargs: dict | None = None, max_retries: int | Retry | None = None, ) -> search.Search: """Search for items on Archive.org. @@ -504,7 +506,7 @@ def search_items( :py:class:`requests.adapters.HTTPAdapter` takes. :param request_kwargs: Keyword arguments that - :py:class:`requests.Request` takes. + :py:class:`Request` takes. :param max_retries: The number of times to retry a failed request. This can also be an `urllib3.Retry` object. diff --git a/internetarchive/item.py b/internetarchive/item.py index 891c4e90..778426e9 100644 --- a/internetarchive/item.py +++ b/internetarchive/item.py @@ -34,7 +34,7 @@ from functools import total_ordering from logging import getLogger from time import sleep -from typing import Mapping, MutableMapping, Optional +from typing import Generator, Iterable, Mapping, MutableMapping, Optional, SupportsComplex from urllib.parse import quote from xml.parsers.expat import ExpatError @@ -46,6 +46,7 @@ from internetarchive.auth import S3Auth from internetarchive.files import File from internetarchive.iarequest import MetadataRequest, S3Request +from internetarchive.session import ArchiveSession from internetarchive.utils import ( IdentifierListAsItems, IterableToFileAdapter, @@ -70,9 +71,9 @@ class BaseItem: def __init__( self, - identifier: str | None = None, + identifier: str = "", item_metadata: Mapping | None = None, - ): + ) -> None: # Default attributes. self.identifier = identifier self.item_metadata = item_metadata or {} @@ -101,7 +102,10 @@ def __repr__(self) -> str: notloaded = ', item_metadata={}' if not self.exists else '' return f'{self.__class__.__name__}(identifier={self.identifier!r}{notloaded})' - def load(self, item_metadata: Mapping | None = None) -> None: + def load( + self, + item_metadata: Mapping | None = None + ) -> None: if item_metadata: self.item_metadata = item_metadata @@ -111,11 +115,7 @@ def load(self, item_metadata: Mapping | None = None) -> None: setattr(self, key, self.item_metadata[key]) if not self.identifier: - self.identifier = self.metadata.get('identifier') - - mc = self.metadata.get('collection', []) - # TODO: The `type: ignore` on the following line should be removed. See #518 - self.collection = IdentifierListAsItems(mc, self.session) # type: ignore + self.identifier = str(self.metadata.get('identifier')) def __eq__(self, other) -> bool: return (self.item_metadata == other.item_metadata @@ -164,11 +164,11 @@ class Item(BaseItem): """ def __init__( - self, - archive_session, - identifier: str, - item_metadata: Mapping | None = None, - ): + self, + archive_session: ArchiveSession, + identifier: str = "", + item_metadata: Mapping | None = None, + ) -> None: """ :param archive_session: :class:`ArchiveSession ` @@ -194,6 +194,9 @@ def __init__( self.urls = Item.URLs(self) + mc = self.metadata.get('collection', []) + self.collection = IdentifierListAsItems(mc, self.session) + if self.metadata.get('title'): # A copyable link to the item, in MediaWiki format details = self.urls.details # type: ignore @@ -221,14 +224,22 @@ def _make_tab_URL(self, tab: str) -> None: DEFAULT_URL_FORMAT = ('{0.session.protocol}//{0.session.host}' '/{path}/{0.identifier}') - def _make_URL(self, path: str, url_format: str = DEFAULT_URL_FORMAT) -> None: + def _make_URL( + self, + path: str, + url_format: str = DEFAULT_URL_FORMAT + ) -> None: setattr(self, path, url_format.format(self._itm_obj, path=path)) self._paths.append(path) def __str__(self) -> str: return f'URLs ({", ".join(self._paths)}) for {self._itm_obj.identifier}' - def refresh(self, item_metadata: Mapping | None = None, **kwargs) -> None: + def refresh( + self, + item_metadata: dict | None = None, + **kwargs: dict | None + ) -> None: if not item_metadata: item_metadata = self.session.get_metadata(self.identifier, **kwargs) self.load(item_metadata) @@ -247,10 +258,10 @@ def identifier_available(self) -> bool: return availability == 'available' def get_task_summary( - self, - params: Mapping | None = None, - request_kwargs: Mapping | None = None, - ) -> dict: + self, + params: dict | None = None, + request_kwargs: dict | None = None, + ) -> dict: """Get a summary of the item's pending tasks. :param params: Params to send with your request. @@ -260,10 +271,10 @@ def get_task_summary( return self.session.get_tasks_summary(self.identifier, params, request_kwargs) def no_tasks_pending( - self, - params: Mapping | None = None, - request_kwargs: Mapping | None = None, - ) -> bool: + self, + params: dict | None = None, + request_kwargs: dict | None = None, + ) -> bool: """Check if there is any pending task for the item. :param params: Params to send with your request. @@ -273,10 +284,10 @@ def no_tasks_pending( return all(x == 0 for x in self.get_task_summary(params, request_kwargs).values()) def get_all_item_tasks( - self, - params: dict | None = None, - request_kwargs: Mapping | None = None, - ) -> list[catalog.CatalogTask]: + self, + params: dict | None = None, + request_kwargs: dict | None = None, + ) -> set[catalog.CatalogTask]: """Get a list of all tasks for the item, pending and complete. :param params: Query parameters, refer to @@ -293,11 +304,10 @@ def get_all_item_tasks( params.update({'catalog': 1, 'history': 1}) return self.session.get_tasks(self.identifier, params, request_kwargs) - def get_history( - self, - params: Mapping | None = None, - request_kwargs: Mapping | None = None, - ) -> list[catalog.CatalogTask]: + def get_history(self, + params: dict | None = None, + request_kwargs: dict | None = None, + ) -> list[catalog.CatalogTask]: """Get a list of completed catalog tasks for the item. :param params: Params to send with your request. @@ -306,11 +316,10 @@ def get_history( """ return list(self.session.iter_history(self.identifier, params, request_kwargs)) - def get_catalog( - self, - params: Mapping | None = None, - request_kwargs: Mapping | None = None, - ) -> list[catalog.CatalogTask]: + def get_catalog(self, + params: dict | None = None, + request_kwargs: dict | None = None, + ) -> list[catalog.CatalogTask]: """Get a list of pending catalog tasks for the item. :param params: Params to send with your request. @@ -323,9 +332,10 @@ def derive(self, priority: int = 0, remove_derived: str | None = None, reduced_priority: bool = False, - data: MutableMapping | None = None, - headers: Mapping | None = None, - request_kwargs: Mapping | None = None) -> Response: + data: dict | None = None, + headers: dict | None = None, + request_kwargs: Mapping | None = None + ) -> Response: """Derive an item. :param priority: Task priority from 10 to -10 [default: 0] @@ -368,11 +378,12 @@ def derive(self, def fixer(self, ops: list | str | None = None, - priority: int | str | None = None, + priority: int = 0, reduced_priority: bool = False, - data: MutableMapping | None = None, - headers: Mapping | None = None, - request_kwargs: Mapping | None = None) -> Response: + data: dict | None = None, + headers: dict | None = None, + request_kwargs: Mapping | None = None + ) -> Response: """Submit a fixer task on an item. :param ops: The fixer operation(s) to run on the item @@ -414,10 +425,11 @@ def fixer(self, def undark(self, comment: str, - priority: int | str | None = None, + priority: int = 0, reduced_priority: bool = False, - data: Mapping | None = None, - request_kwargs: Mapping | None = None) -> Response: + data: dict | None = None, + request_kwargs: Mapping | None = None + ) -> Response: """Undark the item. :param comment: The curation comment explaining reason for @@ -451,10 +463,11 @@ def undark(self, # TODO: dark and undark have different order for data and reduced_pripoity def dark(self, comment: str, - priority: int | str | None = None, - data: Mapping | None = None, + priority: int = 0, + data: dict | None = None, reduced_priority: bool = False, - request_kwargs: Mapping | None = None) -> Response: + request_kwargs: Mapping | None = None + ) -> Response: """Dark the item. :param comment: The curation comment explaining reason for @@ -493,7 +506,11 @@ def get_review(self) -> Response: r.raise_for_status() return r - def index_review(self, username=None, screenname=None, itemname=None) -> Response: + def index_review(self, + username = None, + screenname = None, + itemname = None + ) -> Response: u = f'{self.session.protocol}//{self.session.host}/services/reviews.php' p = {'identifier': self.identifier} d = {'noindex': '0'} @@ -508,7 +525,11 @@ def index_review(self, username=None, screenname=None, itemname=None) -> Respons r.raise_for_status() return r - def noindex_review(self, username=None, screenname=None, itemname=None) -> Response: + def noindex_review(self, + username = None, + screenname = None, + itemname = None + ) -> Response: u = f'{self.session.protocol}//{self.session.host}/services/reviews.php' p = {'identifier': self.identifier} d = {'noindex': '1'} @@ -523,7 +544,11 @@ def noindex_review(self, username=None, screenname=None, itemname=None) -> Respo r.raise_for_status() return r - def delete_review(self, username=None, screenname=None, itemname=None) -> Response: + def delete_review(self, + username = None, + screenname = None, + itemname = None + ) -> Response: u = f'{self.session.protocol}//{self.session.host}/services/reviews.php' p = {'identifier': self.identifier} d = None @@ -538,7 +563,11 @@ def delete_review(self, username=None, screenname=None, itemname=None) -> Respon r.raise_for_status() return r - def review(self, title, body, stars=None) -> Response: + def review(self, + title: str, + body: str, + stars = None + ) -> Response: u = f'{self.session.protocol}//{self.session.host}/services/reviews.php' p = {'identifier': self.identifier} d = {'title': title, 'body': body} @@ -549,7 +578,10 @@ def review(self, title, body, stars=None) -> Response: r.raise_for_status() return r - def get_file(self, file_name: str, file_metadata: Mapping | None = None) -> File: + def get_file(self, + file_name: str, + file_metadata: Mapping | None = None + ) -> File: """Get a :class:`File ` object for the named file. :param file_metadata: a dict of metadata for the @@ -564,7 +596,8 @@ def get_files(self, formats: str | list[str] | None = None, glob_pattern: str | list[str] | None = None, exclude_pattern: str | list[str] | None = None, - on_the_fly: bool = False): + on_the_fly: bool = False + ) -> Generator[File]: files = files or [] formats = formats or [] exclude_pattern = exclude_pattern or '' @@ -634,7 +667,7 @@ def download(self, stdout: bool = False, params: Mapping | None = None, timeout: float | tuple[int, float] | None = None - ) -> list[Request | Response]: + ) -> list[Request | Response]: """Download files from an item. :param files: Only download files matching given file names. @@ -743,17 +776,17 @@ def download(self, return [] if files: - files = self.get_files(files, on_the_fly=on_the_fly) + files = self.get_files(files, on_the_fly=on_the_fly) # type: ignore else: - files = self.get_files(on_the_fly=on_the_fly) + files = self.get_files(on_the_fly=on_the_fly) # type: ignore if formats: - files = self.get_files(formats=formats, on_the_fly=on_the_fly) + files = self.get_files(formats=formats, on_the_fly=on_the_fly) # type: ignore if glob_pattern: files = self.get_files( glob_pattern=glob_pattern, exclude_pattern=exclude_pattern, on_the_fly=on_the_fly - ) + ) # type: ignore if stdout: files = list(files) # type: ignore @@ -815,9 +848,10 @@ def modify_metadata(self, debug: bool = False, headers: Mapping | None = None, reduced_priority: bool = False, - request_kwargs: Mapping | None = None, + request_kwargs: dict | None = None, timeout: float | None = None, - refresh: bool = True) -> Request | Response: + refresh: bool = True + ) -> Response | Request: """Modify the metadata of an existing item on Archive.org. Note: The Metadata Write API does not yet comply with the @@ -868,7 +902,7 @@ def modify_metadata(self, else: request_kwargs["timeout"] = 60 # type: ignore - _headers = self.session.headers.copy() + _headers = deepcopy(self.session.headers) _headers.update(headers) url = f'{self.session.protocol}//{self.session.host}/metadata/{self.identifier}' @@ -902,10 +936,10 @@ def modify_metadata(self, return resp def delete_flag( - self, - category: str, - user: Optional[str] = None, # noqa: UP007 - ) -> Response: + self, + category: str, + user: Optional[str] = None, # noqa: UP007 + ) -> Response: if user is None: user = f"@{self.session.config.get('general', {}).get('screenname')}" url = f'{self.session.protocol}//{self.session.host}/services/flags/admin.php' @@ -915,10 +949,10 @@ def delete_flag( return r def add_flag( - self, - category: str, - user: Optional[str] = None, # noqa: UP007 - ) -> Response: + self, + category: str, + user: Optional[str] = None, # noqa: UP007 + ) -> Response: if user is None: user = f"@{self.session.config.get('general', {}).get('screenname')}" url = f'{self.session.protocol}//{self.session.host}/services/flags/admin.php' @@ -935,7 +969,10 @@ def get_flags(self) -> Response: return r # TODO: `list` parameter name shadows the Python builtin - def remove_from_simplelist(self, parent, list) -> Response: + def remove_from_simplelist(self, + parent: str | bool | dict | list | SupportsComplex | None, + list: str | bool | dict | list | SupportsComplex | None + ) -> Response: """Remove item from a simplelist. :returns: :class:`requests.Response` @@ -1137,7 +1174,7 @@ def _build_request(): if debug: prepared_request = self.session.prepare_request(_build_request()) body.close() - return prepared_request + return prepared_request # type: ignore else: try: first_try = True diff --git a/internetarchive/search.py b/internetarchive/search.py index ab741023..3984e77d 100644 --- a/internetarchive/search.py +++ b/internetarchive/search.py @@ -26,20 +26,71 @@ :copyright: (C) 2012-2024 by Internet Archive. :license: AGPL 3, see LICENSE for more details. """ -import itertools +from __future__ import annotations + from logging import getLogger +from typing import Generator, Iterable from requests.exceptions import ReadTimeout +from urllib3 import Retry from internetarchive.auth import S3Auth +from internetarchive.item import Item +from internetarchive.session import ArchiveSession log = getLogger(__name__) +class SearchIterator(list): + """This class is an iterator wrapper for search results. + + It provides access to the underlying Search, and supports + len() (since that is known initially).""" + + def __init__(self, search: Search, iterator: Iterable[dict | Item]): + self.search = search + self.iterator = iterator + + def __len__(self) -> int: + return int(self.search.num_found) # type: ignore + + def __next__(self) -> dict | Item: + return next(self.iterator) # type: ignore + + def __iter__(self) -> SearchIterator: + return self + + def __repr__(self) -> str: + return f'{self.__class__.__name__}({self.search!r}, {self.iterator!r})' + class Search: """This class represents an archive.org item search. You can use this class to search for Archive.org items using the advanced search - engine. + engine. By default it uses the scaping API, see for + `documentation `__, + which uses the same query Lucene-like queries supported by + Internet Archive Advanced Search. See the advance search page for + `documentation `__, + when using `pages`. + + :param archive_session: A session + :type archive_session: ArchiveSession + :param query: Lucene-like query string + :type query: str + :param fields: Fields to return. This always includes `identifier`. + :type fields: list[str] or None + :param sorts: Sort by field (value: 'desc' or 'asc') + :type sorts: dict or None + :param params: ? + :type params: dict or None + :param full_text_search: Use the undocumented full text search API + :type full_text_search: bool or None + :param dsl_fts: ? + :type dsl_fts: dict or None + :param request_kwargs: kwargs passed to request + :type request_kwargs: dict or None + :param max_retries: Max retries tried by request + :type max_retries: int or Retry Usage:: @@ -51,14 +102,17 @@ class Search: ... print(result['identifier']) """ - def __init__(self, archive_session, query, - fields=None, - sorts=None, - params=None, - full_text_search=None, - dsl_fts=None, - request_kwargs=None, - max_retries=None): + def __init__(self, + archive_session: ArchiveSession, + query: str, + fields: list[str] | None = None, # UP007 + sorts: dict | None = None, + params: dict | None = None, + full_text_search: bool | None = None, + dsl_fts: bool | None = None, + request_kwargs: dict | None = None, + max_retries: int | Retry | None = None): + params = params or {} self.session = archive_session @@ -71,14 +125,14 @@ def __init__(self, archive_session, query, if self.fts and not self.dsl_fts: self.query = f'!L {self.query}' self.fields = fields or [] - self.sorts = sorts or [] + self.sorts = sorts or {} self.request_kwargs = request_kwargs or {} - self._num_found = None + self._num_found: int | None = None self.fts_url = f'{self.session.protocol}//be-api.us.archive.org/ia-pub-fts-api' self.scrape_url = f'{self.session.protocol}//{self.session.host}/services/search/v1/scrape' self.search_url = f'{self.session.protocol}//{self.session.host}/advancedsearch.php' if self.session.access_key and self.session.secret_key: - self.auth = S3Auth(self.session.access_key, self.session.secret_key) + self.auth: S3Auth | None = S3Auth(self.session.access_key, self.session.secret_key) else: self.auth = None self.max_retries = max_retries if max_retries is not None else 5 @@ -89,7 +143,7 @@ def __init__(self, archive_session, query, if 'rows' in params: params['page'] = 1 else: - default_params['count'] = 10000 + default_params["count"] = "10000" else: default_params['output'] = 'json' # In the beta endpoint 'scope' was called 'index'. @@ -105,15 +159,42 @@ def __init__(self, archive_session, query, self.request_kwargs['timeout'] = 300 # Set retries. - self.session.mount_http_adapter(max_retries=self.max_retries) + self.session.mount_http_adapter(max_retries=self.max_retries) # type: ignore def __repr__(self): return f'Search(query={self.query!r})' - def __iter__(self): + def __iter__(self) -> SearchIterator: return self.iter_as_results() - def _advanced_search(self): + def _get_item_from_search_result( + self, + search_result: SearchIterator + ) -> Item: + return self.session.get_item(search_result['identifier']) # type: ignore + + def iter_as_results(self) -> SearchIterator: + return SearchIterator(self, self._make_results_generator()) # type: ignore + + def iter_as_items(self) -> SearchIterator: + """Returns an iterator over the fetched :class:`internetarchive.item.Item`s. + + This fetches an :class:`internetarchive.item.Item` from IA. + """ + _map = map(self._get_item_from_search_result, self._make_results_generator()) # type: ignore + return SearchIterator(self, _map) + + def _make_results_generator(self) -> Generator[dict, None, None]: + if self.fts: + return self._full_text_search() + if 'user_aggs' in self.params: + return self._user_aggs() + elif 'page' in self.params: + return self._advanced_search() + else: + return self._scrape() + + def _advanced_search(self) -> Generator[dict, None, None]: # Always return identifier. if 'identifier' not in self.fields: self.fields.append('identifier') @@ -137,7 +218,7 @@ def _advanced_search(self): yield j yield from j.get('response', {}).get('docs', []) - def _scrape(self): + def _scrape(self) -> Generator[dict, None, None]: if self.fields: self.params['fields'] = ','.join(self.fields) if self.sorts: @@ -168,7 +249,7 @@ def _scrape(self): f' allotted amount of time for {r.request.url}') break - def _full_text_search(self): + def _full_text_search(self) -> Generator[dict, None, None]: d = { 'q': self.query, 'size': '10000', @@ -180,7 +261,7 @@ def _full_text_search(self): d['scope'] = self.params['scope'] if 'size' in self.params: - d['scroll'] = False + d['scroll'] = str(False) d['size'] = self.params['size'] while True: @@ -198,17 +279,7 @@ def _full_text_search(self): break d['scroll_id'] = scroll_id - def _make_results_generator(self): - if self.fts: - return self._full_text_search() - if 'user_aggs' in self.params: - return self._user_aggs() - elif 'page' in self.params: - return self._advanced_search() - else: - return self._scrape() - - def _user_aggs(self): + def _user_aggs(self) -> Generator[dict, None, None]: """Experimental support for user aggregations. """ del self.params['count'] # advanced search will error if this param is present! @@ -226,7 +297,7 @@ def _user_aggs(self): yield {agg[0]: agg[1]} @property - def num_found(self): + def num_found(self) -> int | None: if not self._num_found: if not self.fts and 'page' in self.params: p = self.params.copy() @@ -247,6 +318,7 @@ def num_found(self): auth=self.auth, **self.request_kwargs) j = r.json() + log.info(r.url) self._handle_scrape_error(j) self._num_found = j.get('total') else: @@ -259,7 +331,7 @@ def num_found(self): self._num_found = j.get('hits', {}).get('total') return self._num_found - def _handle_scrape_error(self, j): + def _handle_scrape_error(self, j: dict) -> None: if 'error' in j: if all(s in j['error'].lower() for s in ['invalid', 'secret']): if not j['error'].endswith('.'): @@ -267,38 +339,5 @@ def _handle_scrape_error(self, j): raise ValueError(f"{j['error']} Try running 'ia configure' and retrying.") raise ValueError(j.get('error')) - def _get_item_from_search_result(self, search_result): - return self.session.get_item(search_result['identifier']) - - def iter_as_results(self): - return SearchIterator(self, self._make_results_generator()) - - def iter_as_items(self): - _map = map(self._get_item_from_search_result, self._make_results_generator()) - return SearchIterator(self, _map) - - def __len__(self): + def __len__(self) -> int | None: return self.num_found - - -class SearchIterator: - """This class is an iterator wrapper for search results. - - It provides access to the underlying Search, and supports - len() (since that is known initially).""" - - def __init__(self, search, iterator): - self.search = search - self.iterator = iterator - - def __len__(self): - return self.search.num_found - - def __next__(self): - return next(self.iterator) - - def __iter__(self): - return self - - def __repr__(self): - return f'{self.__class__.__name__}({self.search!r}, {self.iterator!r})' diff --git a/internetarchive/session.py b/internetarchive/session.py index fe85f57e..af20d11d 100644 --- a/internetarchive/session.py +++ b/internetarchive/session.py @@ -45,7 +45,8 @@ from requests.utils import default_headers from urllib3 import Retry -from internetarchive import __version__, auth, catalog +import internetarchive +from internetarchive import auth, catalog from internetarchive.config import get_config from internetarchive.item import Collection, Item from internetarchive.search import Search @@ -145,7 +146,7 @@ def _get_user_agent_string(self) -> str: except Exception: lang = '' py_version = '{}.{}.{}'.format(*sys.version_info) - return (f'internetarchive/{__version__} ' + return (f'internetarchive/{internetarchive.__version__} ' f'({uname[0]} {uname[-1]}; N; {lang}; {self.access_key}) ' f'Python/{py_version}') @@ -302,12 +303,12 @@ def get_metadata(self, identifier: str, request_kwargs: MutableMapping | None = def search_items(self, query: str, - fields: Iterable[str] | None = None, - sorts: Iterable[str] | None = None, - params: Mapping | None = None, + fields: list[str] | None = None, + sorts: dict | None = None, + params: dict | None = None, full_text_search: bool = False, dsl_fts: bool = False, - request_kwargs: Mapping | None = None, + request_kwargs: dict | None = None, max_retries: int | Retry | None = None) -> Search: """Search for items on Archive.org. @@ -357,18 +358,22 @@ def s3_is_overloaded(self, identifier=None, access_key=None, request_kwargs=None return True return j.get('over_limit') != 0 - def get_tasks_api_rate_limit(self, cmd: str = 'derive.php', request_kwargs: dict | None = None): + def get_tasks_api_rate_limit(self, + cmd: str = 'derive.php', + request_kwargs: dict | None = None + ): return catalog.Catalog(self, request_kwargs).get_rate_limit(cmd=cmd) def submit_task(self, - identifier: str, - cmd: str, + identifier: str = "", + cmd: str ="", comment: str = '', priority: int = 0, data: dict | None = None, headers: dict | None = None, reduced_priority: bool = False, - request_kwargs: Mapping | None = None) -> requests.Response: + request_kwargs: Mapping | None = None + ) -> requests.Response: """Submit an archive.org task. :param identifier: Item identifier. @@ -479,7 +484,8 @@ def get_tasks_summary(self, identifier: str = "", def get_tasks(self, identifier: str = "", params: dict | None = None, - request_kwargs: Mapping | None = None) -> set[catalog.CatalogTask]: + request_kwargs: Mapping | None = None + ) -> set[catalog.CatalogTask]: """Get a list of all tasks meeting all criteria. The list is ordered by submission time.