From 15580a7cb8e2abfcc9dbcf9190952e4bfccd2fa4 Mon Sep 17 00:00:00 2001 From: Dave Date: Mon, 3 Aug 2020 08:12:44 -0400 Subject: [PATCH 1/3] Addresses issue #128 for requests document loader --- lib/pyld/documentloader/requests.py | 47 +++++++++++++++++------------ 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/lib/pyld/documentloader/requests.py b/lib/pyld/documentloader/requests.py index 77f42e1d..2b3ea958 100644 --- a/lib/pyld/documentloader/requests.py +++ b/lib/pyld/documentloader/requests.py @@ -10,40 +10,37 @@ .. moduleauthor:: Olaf Conradi """ import string +import re import urllib.parse as urllib_parse from pyld.jsonld import (JsonLdError, parse_link_header, LINK_HEADER_REL) -def requests_document_loader(secure=False, **kwargs): +def requests_document_loader(secure=False, max_link_follows=2, **kwargs): """ Create a Requests document loader. - Can be used to setup extra Requests args such as verify, cert, timeout, or others. - :param secure: require all requests to use HTTPS (default: False). + :param max_link_follows: Maximum number of alternate link follows allowed. :param **kwargs: extra keyword args for Requests get() call. - :return: the RemoteDocument loader function. """ import requests - def loader(url, options={}): + def loader(url, options={}, link_follow_count=0): """ Retrieves JSON-LD at the given URL. - :param url: the URL to retrieve. - :return: the RemoteDocument. """ try: # validate URL pieces = urllib_parse.urlparse(url) if (not all([pieces.scheme, pieces.netloc]) or - pieces.scheme not in ['http', 'https'] or - set(pieces.netloc) > set( - string.ascii_letters + string.digits + '-.:')): + pieces.scheme not in ['http', 'https'] or + set(pieces.netloc) > set( + string.ascii_letters + string.digits + '-.:')): raise JsonLdError( 'URL could not be dereferenced; only "http" and "https" ' 'URLs are supported.', @@ -69,23 +66,30 @@ def loader(url, options={}): 'contentType': content_type, 'contextUrl': None, 'documentUrl': response.url, - 'document': response.json() + 'document': None } + try: + doc['document'] = response.json() + except json.JSONDecodeError as e: + # document body is not parseable, continue to check link headers + pass + # if content_type in headers['Accept']: + # doc['document'] = response.json() link_header = response.headers.get('link') if link_header: linked_context = parse_link_header(link_header).get( LINK_HEADER_REL) # only 1 related link header permitted if linked_context and content_type != 'application/ld+json': - if isinstance(linked_context, list): - raise JsonLdError( - 'URL could not be dereferenced, ' - 'it has more than one ' - 'associated HTTP Link Header.', - 'jsonld.LoadDocumentError', - {'url': url}, - code='multiple context link headers') - doc['contextUrl'] = linked_context['target'] + if isinstance(linked_context, list): + raise JsonLdError( + 'URL could not be dereferenced, ' + 'it has more than one ' + 'associated HTTP Link Header.', + 'jsonld.LoadDocumentError', + {'url': url}, + code='multiple context link headers') + doc['contextUrl'] = linked_context['target'] linked_alternate = parse_link_header(link_header).get('alternate') # if not JSON-LD, alternate may point there if (linked_alternate and @@ -93,6 +97,9 @@ def loader(url, options={}): not re.match(r'^application\/(\w*\+)?json$', content_type)): doc['contentType'] = 'application/ld+json' doc['documentUrl'] = jsonld.prepend_base(url, linked_alternate['target']) + if link_follow_count >= max_link_follows: + raise requests.TooManyRedirects(f"Exceeded maximum link header redirects ({max_link_follows})") + return loader(doc['documentUrl'], options=options, link_follow_count=link_follow_count + 1) return doc except JsonLdError as e: raise e From 9f452e56206a0d218b7738098a294bf5ebff2b36 Mon Sep 17 00:00:00 2001 From: Dave Date: Mon, 3 Aug 2020 08:19:39 -0400 Subject: [PATCH 2/3] Addresses issue #128 for requests document loader, typos --- lib/pyld/documentloader/requests.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/pyld/documentloader/requests.py b/lib/pyld/documentloader/requests.py index 2b3ea958..037ff660 100644 --- a/lib/pyld/documentloader/requests.py +++ b/lib/pyld/documentloader/requests.py @@ -12,8 +12,9 @@ import string import re import urllib.parse as urllib_parse +from json import JSONDecodeError -from pyld.jsonld import (JsonLdError, parse_link_header, LINK_HEADER_REL) +from pyld.jsonld import (JsonLdError, parse_link_header, prepend_base, LINK_HEADER_REL) def requests_document_loader(secure=False, max_link_follows=2, **kwargs): @@ -70,7 +71,7 @@ def loader(url, options={}, link_follow_count=0): } try: doc['document'] = response.json() - except json.JSONDecodeError as e: + except JSONDecodeError as e: # document body is not parseable, continue to check link headers pass # if content_type in headers['Accept']: @@ -96,7 +97,7 @@ def loader(url, options={}, link_follow_count=0): linked_alternate.get('type') == 'application/ld+json' and not re.match(r'^application\/(\w*\+)?json$', content_type)): doc['contentType'] = 'application/ld+json' - doc['documentUrl'] = jsonld.prepend_base(url, linked_alternate['target']) + doc['documentUrl'] = prepend_base(url, linked_alternate['target']) if link_follow_count >= max_link_follows: raise requests.TooManyRedirects(f"Exceeded maximum link header redirects ({max_link_follows})") return loader(doc['documentUrl'], options=options, link_follow_count=link_follow_count + 1) From 07913cf10c4c0563fb6b3da619e645c6c68270ff Mon Sep 17 00:00:00 2001 From: Dave Date: Mon, 3 Aug 2020 08:33:53 -0400 Subject: [PATCH 3/3] Check content_type match instead of trapping failed parse --- lib/pyld/documentloader/requests.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/lib/pyld/documentloader/requests.py b/lib/pyld/documentloader/requests.py index 037ff660..5e550ecd 100644 --- a/lib/pyld/documentloader/requests.py +++ b/lib/pyld/documentloader/requests.py @@ -69,11 +69,10 @@ def loader(url, options={}, link_follow_count=0): 'documentUrl': response.url, 'document': None } - try: + # Try loading the JSON if the content_type matches + # A failure here means the response body is not valid json + if re.match(r'^application\/(\w*\+)?json$', content_type): doc['document'] = response.json() - except JSONDecodeError as e: - # document body is not parseable, continue to check link headers - pass # if content_type in headers['Accept']: # doc['document'] = response.json() link_header = response.headers.get('link')