diff --git a/lib/pyld/documentloader/requests.py b/lib/pyld/documentloader/requests.py index 77f42e1d..5e550ecd 100644 --- a/lib/pyld/documentloader/requests.py +++ b/lib/pyld/documentloader/requests.py @@ -10,40 +10,38 @@ .. moduleauthor:: Olaf Conradi """ import string +import re import urllib.parse as urllib_parse +from json import JSONDecodeError -from pyld.jsonld import (JsonLdError, parse_link_header, LINK_HEADER_REL) +from pyld.jsonld import (JsonLdError, parse_link_header, prepend_base, LINK_HEADER_REL) -def requests_document_loader(secure=False, **kwargs): +def requests_document_loader(secure=False, max_link_follows=2, **kwargs): """ Create a Requests document loader. - Can be used to setup extra Requests args such as verify, cert, timeout, or others. - :param secure: require all requests to use HTTPS (default: False). + :param max_link_follows: Maximum number of alternate link follows allowed. :param **kwargs: extra keyword args for Requests get() call. - :return: the RemoteDocument loader function. """ import requests - def loader(url, options={}): + def loader(url, options={}, link_follow_count=0): """ Retrieves JSON-LD at the given URL. - :param url: the URL to retrieve. - :return: the RemoteDocument. """ try: # validate URL pieces = urllib_parse.urlparse(url) if (not all([pieces.scheme, pieces.netloc]) or - pieces.scheme not in ['http', 'https'] or - set(pieces.netloc) > set( - string.ascii_letters + string.digits + '-.:')): + pieces.scheme not in ['http', 'https'] or + set(pieces.netloc) > set( + string.ascii_letters + string.digits + '-.:')): raise JsonLdError( 'URL could not be dereferenced; only "http" and "https" ' 'URLs are supported.', @@ -69,30 +67,39 @@ def loader(url, options={}): 'contentType': content_type, 'contextUrl': None, 'documentUrl': response.url, - 'document': response.json() + 'document': None } + # Try loading the JSON if the content_type matches + # A failure here means the response body is not valid json + if re.match(r'^application\/(\w*\+)?json$', content_type): + doc['document'] = response.json() + # if content_type in headers['Accept']: + # doc['document'] = response.json() link_header = response.headers.get('link') if link_header: linked_context = parse_link_header(link_header).get( LINK_HEADER_REL) # only 1 related link header permitted if linked_context and content_type != 'application/ld+json': - if isinstance(linked_context, list): - raise JsonLdError( - 'URL could not be dereferenced, ' - 'it has more than one ' - 'associated HTTP Link Header.', - 'jsonld.LoadDocumentError', - {'url': url}, - code='multiple context link headers') - doc['contextUrl'] = linked_context['target'] + if isinstance(linked_context, list): + raise JsonLdError( + 'URL could not be dereferenced, ' + 'it has more than one ' + 'associated HTTP Link Header.', + 'jsonld.LoadDocumentError', + {'url': url}, + code='multiple context link headers') + doc['contextUrl'] = linked_context['target'] linked_alternate = parse_link_header(link_header).get('alternate') # if not JSON-LD, alternate may point there if (linked_alternate and linked_alternate.get('type') == 'application/ld+json' and not re.match(r'^application\/(\w*\+)?json$', content_type)): doc['contentType'] = 'application/ld+json' - doc['documentUrl'] = jsonld.prepend_base(url, linked_alternate['target']) + doc['documentUrl'] = prepend_base(url, linked_alternate['target']) + if link_follow_count >= max_link_follows: + raise requests.TooManyRedirects(f"Exceeded maximum link header redirects ({max_link_follows})") + return loader(doc['documentUrl'], options=options, link_follow_count=link_follow_count + 1) return doc except JsonLdError as e: raise e