Skip to content

Commit

Permalink
Start the messy path to removing requests
Browse files Browse the repository at this point in the history
This starts the path toward removing the requests package by rejiggering *most* of our exception handling. There is a huge amount more to do and this doesn't even begin to start using urllib3 to make requests, so basically everything should be completely broken with this commit.

Part of #58.
  • Loading branch information
Mr0grog committed Dec 16, 2023
1 parent 790ad74 commit f4f8504
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 40 deletions.
108 changes: 73 additions & 35 deletions wayback/_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,22 @@
import hashlib
import logging
import re
import requests
from requests.exceptions import (ChunkedEncodingError,
ConnectionError,
ContentDecodingError,
ProxyError,
RetryError,
Timeout)
# import requests
# from requests.exceptions import (ChunkedEncodingError,
# ConnectionError,
# ContentDecodingError,
# ProxyError,
# RetryError,
# Timeout)
import time
from urllib.parse import urljoin, urlparse
from urllib3.connectionpool import HTTPConnectionPool
from urllib3.exceptions import (ConnectTimeoutError,
MaxRetryError,
ReadTimeoutError)
ReadTimeoutError,
ProxyError,
TimeoutError,
ProtocolError)
from warnings import warn
from . import _utils, __version__
from ._models import CdxRecord, Memento
Expand Down Expand Up @@ -158,6 +161,8 @@ def cdx_hash(content):
return b32encode(hashlib.sha1(content).digest()).decode()


# XXX: see how requests reads the body:
# https://github.com/psf/requests/blob/a25fde6989f8df5c3d823bc9f2e2fc24aa71f375/src/requests/models.py#L794-L839
def read_and_close(response):
# Read content so it gets cached and close the response so
# we can release the connection for reuse. See:
Expand Down Expand Up @@ -336,10 +341,10 @@ def _new_header_init(self, headers=None, **kwargs):
#####################################################################


class WaybackSession(_utils.DisableAfterCloseSession, requests.Session):
class WaybackSession(_utils.DisableAfterCloseSession):
"""
A custom session object that pools network connections and resources for
requests to the Wayback Machine.
Manages HTTP requests to Wayback Machine servers, handling things like
retries, rate limiting, connection pooling, timeouts, etc.
Parameters
----------
Expand Down Expand Up @@ -395,11 +400,22 @@ class WaybackSession(_utils.DisableAfterCloseSession, requests.Session):
# they make sense to retry here. Usually not in other contexts, though.
retryable_statuses = frozenset((413, 421, 500, 502, 503, 504, 599))

# XXX: TimeoutError should be a base class for both ConnectTimeoutError
# and ReadTimeoutError, so we don't need them here...?
retryable_errors = (ConnectTimeoutError, MaxRetryError, ReadTimeoutError,
ProxyError, RetryError, Timeout)
ProxyError, TimeoutError,
# XXX: These used to be wrapped with
# requests.ConnectionError, which we would then have to
# inspect to see if it needed retrying. Need to make
# sure/think through whether these should be retried.
ProtocolError, OSError)
# Handleable errors *may* be retryable, but need additional logic beyond
# just the error type. See `should_retry_error()`.
handleable_errors = (ConnectionError,) + retryable_errors
#
# XXX: see notes above about what should get retried; which things need to
# be caught but then more deeply inspected, blah blah blah:
# handleable_errors = (ConnectionError,) + retryable_errors
handleable_errors = () + retryable_errors

def __init__(self, retries=6, backoff=2, timeout=60, user_agent=None,
search_calls_per_second=DEFAULT_CDX_RATE_LIMIT,
Expand Down Expand Up @@ -463,6 +479,17 @@ def send(self, request: requests.PreparedRequest, **kwargs):
else:
logger.debug('Received error response (status: %s), will retry', response.status_code)
read_and_close(response)
# XXX: urllib3's MaxRetryError can wrap all the other errors, so
# we should probably be checking `error.reason` on it. See how
# requests handles this: https://github.com/psf/requests/blob/a25fde6989f8df5c3d823bc9f2e2fc24aa71f375/src/requests/adapters.py#L502-L537
#
# XXX: requests.RetryError used to be in our list of handleable
# errors; it gets raised when urllib3 raises a MaxRetryError with a
# ResponseError for its `reason` attribute. Need to test the
# situation here...
#
# XXX: Consider how read-related exceptions need to be handled (or
# not). In requests: https://github.com/psf/requests/blob/a25fde6989f8df5c3d823bc9f2e2fc24aa71f375/src/requests/models.py#L794-L839
except WaybackSession.handleable_errors as error:
response = getattr(error, 'response', None)
if response is not None:
Expand Down Expand Up @@ -507,17 +534,23 @@ def should_retry(self, response):
def should_retry_error(self, error):
if isinstance(error, WaybackSession.retryable_errors):
return True
elif isinstance(error, ConnectionError):
# ConnectionErrors from requests actually wrap a whole family of
# more detailed errors from urllib3, so we need to do some string
# checking to determine whether the error is retryable.
text = str(error)
# NOTE: we have also seen this, which may warrant retrying:
# `requests.exceptions.ConnectionError: ('Connection aborted.',
# RemoteDisconnected('Remote end closed connection without
# response'))`
if 'NewConnectionError' in text or 'Max retries' in text:
return True
# XXX: ConnectionError was a broad wrapper from requests; there are more
# narrow errors in urllib3 we can catch, so this is probably (???) no
# longer relevant. But urllib3 has some other wrapper exceptions that we
# might need to dig into more, see:
# https://github.com/psf/requests/blob/a25fde6989f8df5c3d823bc9f2e2fc24aa71f375/src/requests/adapters.py#L502-L537
#
# elif isinstance(error, ConnectionError):
# # ConnectionErrors from requests actually wrap a whole family of
# # more detailed errors from urllib3, so we need to do some string
# # checking to determine whether the error is retryable.
# text = str(error)
# # NOTE: we have also seen this, which may warrant retrying:
# # `requests.exceptions.ConnectionError: ('Connection aborted.',
# # RemoteDisconnected('Remote end closed connection without
# # response'))`
# if 'NewConnectionError' in text or 'Max retries' in text:
# return True

return False

Expand All @@ -537,6 +570,11 @@ def get_retry_delay(self, retries, response=None):

return delay

# XXX: Needs to do the right thing. Requests sessions closed all their
# adapters, which does:
# self.poolmanager.clear()
# for proxy in self.proxy_manager.values():
# proxy.clear()
def reset(self):
"Reset any network connections the session is using."
# Close really just closes all the adapters in `self.adapters`. We
Expand Down Expand Up @@ -785,22 +823,22 @@ def search(self, url, *, match_type=None, limit=1000, offset=None,
sent_query, next_query = next_query, None
response = self.session.request('GET', CDX_SEARCH_URL,
params=sent_query)
try:
# Read/cache the response and close straightaway. If we need
# to raise for status, we want to pre-emptively close the
# response so a user handling the error doesn't need to
# worry about it. If we don't raise here, we still want to
# close the connection so it doesn't leak when we move onto
# the next of results or when this iterator ends.
read_and_close(response)
response.raise_for_status()
except requests.exceptions.HTTPError as error:

# Read/cache the response and close straightaway. If we need
# to raise an exception based on the response, we want to
# pre-emptively close it so a user handling the error doesn't need
# to worry about it. If we don't raise here, we still want to
# close the connection so it doesn't leak when we move onto
# the next page of results or when this iterator ends.
read_and_close(response)

if response.status >= 400:
if 'AdministrativeAccessControlException' in response.text:
raise BlockedSiteError(query['url'])
elif 'RobotAccessControlException' in response.text:
raise BlockedByRobotsError(query['url'])
else:
raise WaybackException(str(error))
raise WaybackException(f'HTTP {response.status} error for CDX search: "{query}"')

lines = iter(response.content.splitlines())

Expand Down
10 changes: 5 additions & 5 deletions wayback/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@
import email.utils
import logging
import re
import requests
import requests.adapters
import threading
import time
from typing import Union
Expand Down Expand Up @@ -322,20 +320,22 @@ def __exit_all__(self, type, value, traceback):
pass


class DisableAfterCloseSession(requests.Session):
class DisableAfterCloseSession:
"""
A custom session object raises a :class:`SessionClosedError` if you try to
use it after closing it, to help identify and avoid potentially dangerous
code patterns. (Standard session objects continue to be usable after
closing, even if they may not work exactly as expected.)
"""
_closed = False
_closed: bool = False

def close(self, disable=True):
def close(self, disable: bool = True) -> None:
super().close()
if disable:
self._closed = True

# XXX: this no longer works correctly, we probably need some sort of
# decorator or something
def send(self, *args, **kwargs):
if self._closed:
raise SessionClosedError('This session has already been closed '
Expand Down

0 comments on commit f4f8504

Please sign in to comment.