From 15580a7cb8e2abfcc9dbcf9190952e4bfccd2fa4 Mon Sep 17 00:00:00 2001
From: Dave <datadavev@users.noreply.github.com>
Date: Mon, 3 Aug 2020 08:12:44 -0400
Subject: [PATCH 1/3] Addresses issue #128 for requests document loader

---
 lib/pyld/documentloader/requests.py | 47 +++++++++++++++++------------
 1 file changed, 27 insertions(+), 20 deletions(-)

diff --git a/lib/pyld/documentloader/requests.py b/lib/pyld/documentloader/requests.py
index 77f42e1d..2b3ea958 100644
--- a/lib/pyld/documentloader/requests.py
+++ b/lib/pyld/documentloader/requests.py
@@ -10,40 +10,37 @@
 .. moduleauthor:: Olaf Conradi <olaf@conradi.org>
 """
 import string
+import re
 import urllib.parse as urllib_parse
 
 from pyld.jsonld import (JsonLdError, parse_link_header, LINK_HEADER_REL)
 
 
-def requests_document_loader(secure=False, **kwargs):
+def requests_document_loader(secure=False, max_link_follows=2, **kwargs):
     """
     Create a Requests document loader.
-
     Can be used to setup extra Requests args such as verify, cert, timeout,
     or others.
-
     :param secure: require all requests to use HTTPS (default: False).
+    :param max_link_follows: maximum number of alternate Link header hops to follow (default: 2).
     :param **kwargs: extra keyword args for Requests get() call.
-
     :return: the RemoteDocument loader function.
     """
     import requests
 
-    def loader(url, options={}):
+    def loader(url, options={}, link_follow_count=0):
         """
         Retrieves JSON-LD at the given URL.
-
         :param url: the URL to retrieve.
-
         :return: the RemoteDocument.
         """
         try:
             # validate URL
             pieces = urllib_parse.urlparse(url)
             if (not all([pieces.scheme, pieces.netloc]) or
-                pieces.scheme not in ['http', 'https'] or
-                set(pieces.netloc) > set(
-                    string.ascii_letters + string.digits + '-.:')):
+                    pieces.scheme not in ['http', 'https'] or
+                    set(pieces.netloc) > set(
+                        string.ascii_letters + string.digits + '-.:')):
                 raise JsonLdError(
                     'URL could not be dereferenced; only "http" and "https" '
                     'URLs are supported.',
@@ -69,23 +66,30 @@ def loader(url, options={}):
                 'contentType': content_type,
                 'contextUrl': None,
                 'documentUrl': response.url,
-                'document': response.json()
+                'document': None
             }
+            try:
+                doc['document'] = response.json()
+            except json.JSONDecodeError as e:
+                # document body is not parseable, continue to check link headers
+                pass
+            # if content_type in headers['Accept']:
+            #    doc['document'] = response.json()
             link_header = response.headers.get('link')
             if link_header:
                 linked_context = parse_link_header(link_header).get(
                     LINK_HEADER_REL)
                 # only 1 related link header permitted
                 if linked_context and content_type != 'application/ld+json':
-                  if isinstance(linked_context, list):
-                      raise JsonLdError(
-                          'URL could not be dereferenced, '
-                          'it has more than one '
-                          'associated HTTP Link Header.',
-                          'jsonld.LoadDocumentError',
-                          {'url': url},
-                          code='multiple context link headers')
-                  doc['contextUrl'] = linked_context['target']
+                    if isinstance(linked_context, list):
+                        raise JsonLdError(
+                            'URL could not be dereferenced, '
+                            'it has more than one '
+                            'associated HTTP Link Header.',
+                            'jsonld.LoadDocumentError',
+                            {'url': url},
+                            code='multiple context link headers')
+                    doc['contextUrl'] = linked_context['target']
                 linked_alternate = parse_link_header(link_header).get('alternate')
                 # if not JSON-LD, alternate may point there
                 if (linked_alternate and
@@ -93,6 +97,9 @@ def loader(url, options={}):
                         not re.match(r'^application\/(\w*\+)?json$', content_type)):
                     doc['contentType'] = 'application/ld+json'
                     doc['documentUrl'] = jsonld.prepend_base(url, linked_alternate['target'])
+                    if link_follow_count >= max_link_follows:
+                        raise requests.TooManyRedirects(f"Exceeded maximum link header redirects ({max_link_follows})")
+                    return loader(doc['documentUrl'], options=options, link_follow_count=link_follow_count + 1)
             return doc
         except JsonLdError as e:
             raise e

From 9f452e56206a0d218b7738098a294bf5ebff2b36 Mon Sep 17 00:00:00 2001
From: Dave <datadavev@users.noreply.github.com>
Date: Mon, 3 Aug 2020 08:19:39 -0400
Subject: [PATCH 2/3] Addresses issue #128 for requests document loader, typos

---
 lib/pyld/documentloader/requests.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/lib/pyld/documentloader/requests.py b/lib/pyld/documentloader/requests.py
index 2b3ea958..037ff660 100644
--- a/lib/pyld/documentloader/requests.py
+++ b/lib/pyld/documentloader/requests.py
@@ -12,8 +12,9 @@
 import string
 import re
 import urllib.parse as urllib_parse
+from json import JSONDecodeError
 
-from pyld.jsonld import (JsonLdError, parse_link_header, LINK_HEADER_REL)
+from pyld.jsonld import (JsonLdError, parse_link_header, prepend_base, LINK_HEADER_REL)
 
 
 def requests_document_loader(secure=False, max_link_follows=2, **kwargs):
@@ -70,7 +71,7 @@ def loader(url, options={}, link_follow_count=0):
             }
             try:
                 doc['document'] = response.json()
-            except json.JSONDecodeError as e:
+            except JSONDecodeError as e:
                 # document body is not parseable, continue to check link headers
                 pass
             # if content_type in headers['Accept']:
@@ -96,7 +97,7 @@ def loader(url, options={}, link_follow_count=0):
                         linked_alternate.get('type') == 'application/ld+json' and
                         not re.match(r'^application\/(\w*\+)?json$', content_type)):
                     doc['contentType'] = 'application/ld+json'
-                    doc['documentUrl'] = jsonld.prepend_base(url, linked_alternate['target'])
+                    doc['documentUrl'] = prepend_base(url, linked_alternate['target'])
                     if link_follow_count >= max_link_follows:
                         raise requests.TooManyRedirects(f"Exceeded maximum link header redirects ({max_link_follows})")
                     return loader(doc['documentUrl'], options=options, link_follow_count=link_follow_count + 1)

From 07913cf10c4c0563fb6b3da619e645c6c68270ff Mon Sep 17 00:00:00 2001
From: Dave <datadavev@users.noreply.github.com>
Date: Mon, 3 Aug 2020 08:33:53 -0400
Subject: [PATCH 3/3] Check content_type match instead of trapping failed parse

---
 lib/pyld/documentloader/requests.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/lib/pyld/documentloader/requests.py b/lib/pyld/documentloader/requests.py
index 037ff660..5e550ecd 100644
--- a/lib/pyld/documentloader/requests.py
+++ b/lib/pyld/documentloader/requests.py
@@ -69,11 +69,10 @@ def loader(url, options={}, link_follow_count=0):
                 'documentUrl': response.url,
                 'document': None
             }
-            try:
+            # Parse the body as JSON only when content_type is a JSON media type.
+            # If response.json() raises here, the body is not valid JSON.
+            if re.match(r'^application\/(\w*\+)?json$', content_type):
                 doc['document'] = response.json()
-            except JSONDecodeError as e:
-                # document body is not parseable, continue to check link headers
-                pass
             # if content_type in headers['Accept']:
             #    doc['document'] = response.json()
             link_header = response.headers.get('link')