From f4f8504729db3d5fbfeffff26e2b0416bdb51074 Mon Sep 17 00:00:00 2001 From: Rob Brackett Date: Wed, 13 Dec 2023 16:56:01 -0800 Subject: [PATCH] Start the messy path to removing requests This starts the path toward removing the requests package by rejiggering *most* of our exception handling. There is a huge amount more to do and this doesn't even begin to start using urllib3 to make requests, so basically everything should be completely broken with this commit. Part of #58. --- wayback/_client.py | 108 ++++++++++++++++++++++++++++++--------------- wayback/_utils.py | 10 ++--- 2 files changed, 78 insertions(+), 40 deletions(-) diff --git a/wayback/_client.py b/wayback/_client.py index 0593e8f..df8cf7c 100644 --- a/wayback/_client.py +++ b/wayback/_client.py @@ -19,19 +19,22 @@ import hashlib import logging import re -import requests -from requests.exceptions import (ChunkedEncodingError, - ConnectionError, - ContentDecodingError, - ProxyError, - RetryError, - Timeout) +# import requests +# from requests.exceptions import (ChunkedEncodingError, +# ConnectionError, +# ContentDecodingError, +# ProxyError, +# RetryError, +# Timeout) import time from urllib.parse import urljoin, urlparse from urllib3.connectionpool import HTTPConnectionPool from urllib3.exceptions import (ConnectTimeoutError, MaxRetryError, - ReadTimeoutError) + ReadTimeoutError, + ProxyError, + TimeoutError, + ProtocolError) from warnings import warn from . import _utils, __version__ from ._models import CdxRecord, Memento @@ -158,6 +161,8 @@ def cdx_hash(content): return b32encode(hashlib.sha1(content).digest()).decode() +# XXX: see how requests reads the body: +# https://github.com/psf/requests/blob/a25fde6989f8df5c3d823bc9f2e2fc24aa71f375/src/requests/models.py#L794-L839 def read_and_close(response): # Read content so it gets cached and close the response so # we can release the connection for reuse. See: @@ -336,10 +341,10 @@ def _new_header_init(self, headers=None, **kwargs): ##################################################################### -class WaybackSession(_utils.DisableAfterCloseSession, requests.Session): +class WaybackSession(_utils.DisableAfterCloseSession): """ - A custom session object that pools network connections and resources for - requests to the Wayback Machine. + Manages HTTP requests to Wayback Machine servers, handling things like + retries, rate limiting, connection pooling, timeouts, etc. Parameters ---------- @@ -395,11 +400,22 @@ class WaybackSession(_utils.DisableAfterCloseSession, requests.Session): # they make sense to retry here. Usually not in other contexts, though. retryable_statuses = frozenset((413, 421, 500, 502, 503, 504, 599)) + # XXX: TimeoutError should be a base class for both ConnectTimeoutError + # and ReadTimeoutError, so we don't need them here...? retryable_errors = (ConnectTimeoutError, MaxRetryError, ReadTimeoutError, - ProxyError, RetryError, Timeout) + ProxyError, TimeoutError, + # XXX: These used to be wrapped with + # requests.ConnectionError, which we would then have to + # inspect to see if it needed retrying. Need to make + # sure/think through whether these should be retried. + ProtocolError, OSError) # Handleable errors *may* be retryable, but need additional logic beyond # just the error type. See `should_retry_error()`. - handleable_errors = (ConnectionError,) + retryable_errors + # + # XXX: see notes above about what should get retried; which things need to + # be caught but then more deeply inspected, blah blah blah: + # handleable_errors = (ConnectionError,) + retryable_errors + handleable_errors = () + retryable_errors def __init__(self, retries=6, backoff=2, timeout=60, user_agent=None, search_calls_per_second=DEFAULT_CDX_RATE_LIMIT, @@ -463,6 +479,17 @@ def send(self, request: requests.PreparedRequest, **kwargs): else: logger.debug('Received error response (status: %s), will retry', response.status_code) read_and_close(response) + # XXX: urllib3's MaxRetryError can wrap all the other errors, so + # we should probably be checking `error.reason` on it. See how + # requests handles this: https://github.com/psf/requests/blob/a25fde6989f8df5c3d823bc9f2e2fc24aa71f375/src/requests/adapters.py#L502-L537 + # + # XXX: requests.RetryError used to be in our list of handleable + # errors; it gets raised when urllib3 raises a MaxRetryError with a + # ResponseError for its `reason` attribute. Need to test the + # situation here... + # + # XXX: Consider how read-related exceptions need to be handled (or + # not). In requests: https://github.com/psf/requests/blob/a25fde6989f8df5c3d823bc9f2e2fc24aa71f375/src/requests/models.py#L794-L839 except WaybackSession.handleable_errors as error: response = getattr(error, 'response', None) if response is not None: @@ -507,17 +534,23 @@ def should_retry(self, response): def should_retry_error(self, error): if isinstance(error, WaybackSession.retryable_errors): return True - elif isinstance(error, ConnectionError): - # ConnectionErrors from requests actually wrap a whole family of - # more detailed errors from urllib3, so we need to do some string - # checking to determine whether the error is retryable. - text = str(error) - # NOTE: we have also seen this, which may warrant retrying: - # `requests.exceptions.ConnectionError: ('Connection aborted.', - # RemoteDisconnected('Remote end closed connection without - # response'))` - if 'NewConnectionError' in text or 'Max retries' in text: - return True + # XXX: ConnectionError was a broad wrapper from requests; there are more + # narrow errors in urllib3 we can catch, so this is probably (???) no + # longer relevant. But urllib3 has some other wrapper exceptions that we + # might need to dig into more, see: + # https://github.com/psf/requests/blob/a25fde6989f8df5c3d823bc9f2e2fc24aa71f375/src/requests/adapters.py#L502-L537 + # + # elif isinstance(error, ConnectionError): + # # ConnectionErrors from requests actually wrap a whole family of + # # more detailed errors from urllib3, so we need to do some string + # # checking to determine whether the error is retryable. + # text = str(error) + # # NOTE: we have also seen this, which may warrant retrying: + # # `requests.exceptions.ConnectionError: ('Connection aborted.', + # # RemoteDisconnected('Remote end closed connection without + # # response'))` + # if 'NewConnectionError' in text or 'Max retries' in text: + # return True return False @@ -537,6 +570,11 @@ def get_retry_delay(self, retries, response=None): return delay + # XXX: Needs to do the right thing. Requests sessions closed all their + # adapters, which does: + # self.poolmanager.clear() + # for proxy in self.proxy_manager.values(): + # proxy.clear() def reset(self): "Reset any network connections the session is using." # Close really just closes all the adapters in `self.adapters`. We @@ -785,22 +823,22 @@ def search(self, url, *, match_type=None, limit=1000, offset=None, sent_query, next_query = next_query, None response = self.session.request('GET', CDX_SEARCH_URL, params=sent_query) - try: - # Read/cache the response and close straightaway. If we need - # to raise for status, we want to pre-emptively close the - # response so a user handling the error doesn't need to - # worry about it. If we don't raise here, we still want to - # close the connection so it doesn't leak when we move onto - # the next of results or when this iterator ends. - read_and_close(response) - response.raise_for_status() - except requests.exceptions.HTTPError as error: + + # Read/cache the response and close straightaway. If we need + # to raise an exception based on the response, we want to + # pre-emptively close it so a user handling the error doesn't need + # to worry about it. If we don't raise here, we still want to + # close the connection so it doesn't leak when we move onto + # the next page of results or when this iterator ends. + read_and_close(response) + + if response.status >= 400: if 'AdministrativeAccessControlException' in response.text: raise BlockedSiteError(query['url']) elif 'RobotAccessControlException' in response.text: raise BlockedByRobotsError(query['url']) else: - raise WaybackException(str(error)) + raise WaybackException(f'HTTP {response.status} error for CDX search: "{query}"') lines = iter(response.content.splitlines()) diff --git a/wayback/_utils.py b/wayback/_utils.py index 4d8fa05..aab15e8 100644 --- a/wayback/_utils.py +++ b/wayback/_utils.py @@ -4,8 +4,6 @@ import email.utils import logging import re -import requests -import requests.adapters import threading import time from typing import Union @@ -322,20 +320,22 @@ def __exit_all__(self, type, value, traceback): pass -class DisableAfterCloseSession(requests.Session): +class DisableAfterCloseSession: """ A custom session object raises a :class:`SessionClosedError` if you try to use it after closing it, to help identify and avoid potentially dangerous code patterns. (Standard session objects continue to be usable after closing, even if they may not work exactly as expected.) """ - _closed = False + _closed: bool = False - def close(self, disable=True): + def close(self, disable: bool = True) -> None: super().close() if disable: self._closed = True + # XXX: this no longer works correctly, we probably need some sort of + # decorator or something def send(self, *args, **kwargs): if self._closed: raise SessionClosedError('This session has already been closed '