feat: Adds optional support for giving a complete URL as the ID field (#22)

Addono · web-flow · commit e9c256438642 · 2024-12-27T11:33:43.000+01:00
With this feature, the complete URL can be passed as the `id` field, and the application will make a best-effort attempt to infer the ID from the URL.

This might not work for all URLs, as the ID can be stored in different ways. If the ID cannot be parsed from the URL, please fall back to extracting the ID from the URL manually and passing it directly as the `id` field.
diff --git a/README.md b/README.md
@@ -51,7 +51,7 @@ usage: hathitrust-downloader [-h] [--name NAME] id start_page end_page
 Book downloader for HathiTrust
 
 positional arguments:
-  id           The ID of the book, e.g 'mdp.39015027794331'.
+  id           The ID of the book, e.g 'mdp.39015027794331' or a complete URL.
   start_page   The page number of the first page to be downloaded.
   end_page     The last number of the last page to be downloaded (inclusive).
 
@@ -72,6 +72,7 @@ hathitrust-downloader mdp.39015073487137 1 10 --name my-book
 > https://babel.hathitrust.org/cgi/pt?id=mdp.39015073487137&seq=13
 >                                        ^^^^^^^^^^^^^^^^^^ This demarks the ID of this book
 > ```
+> Alternatively, you can provide the complete URL as the ID argument, and the tool will attempt to parse the ID from the URL. Note that this feature is best-effort, and for optimal stability, it is still recommended to provide the specific ID directly.
 
 ## Troubleshooting
 
diff --git a/hathitrustdownloader/cli.py b/hathitrustdownloader/cli.py
@@ -3,17 +3,35 @@
 import requests
 import time
 import argparse
+from urllib.parse import urlparse, parse_qs
+
+def extract_id_from_url(url):
+    """
+    Extracts the ID parameter from a HathiTrust URL.
+
+    Args:
+        url (str): The complete URL containing the ID parameter.
+
+    Returns:
+        str: The extracted ID value or None if not found.
+    """
+    parsed_url = urlparse(url)
+    query_params = parse_qs(parsed_url.query)
+    return query_params.get('id', [None])[0]
 
 def main():
     parser = argparse.ArgumentParser(description='Book downloader for HathiTrust')
 
-    parser.add_argument('id', type=str, help="The ID of the book, e.g 'mdp.39015027794331'.")
+    parser.add_argument('id', type=str, help="The ID of the book, e.g 'mdp.39015027794331' or a complete URL.")
     parser.add_argument('start_page', type=int, help="The page number of the first page to be downloaded.")
     parser.add_argument('end_page', type=int, help="The last number of the last page to be downloaded (inclusive).")
     parser.add_argument('--name', dest='name', type=str, help="The start of the filename. Defaults to using the id. This can also be used to change the path.")
 
     args = parser.parse_args()
 
+    # Extract ID from URL if necessary
+    book_id = extract_id_from_url(args.id) if args.id.startswith("http") else args.id
+
     # If --name is used to specify a path, extract the directory part
     if args.name:
         directory = os.path.dirname(args.name)
@@ -26,20 +44,20 @@ def main():
                 return
 
     page_numbers = [i for i in range(args.start_page - 1, args.end_page)]
-    urls = ["https://babel.hathitrust.org/cgi/imgsrv/download/pdf?id=%s;orient=0;size=100;seq=%s;attachment=0" % (args.id, i + 1) for i in page_numbers]
+    urls = ["https://babel.hathitrust.org/cgi/imgsrv/download/pdf?id=%s;orient=0;size=100;seq=%s;attachment=0" % (book_id, i + 1) for i in page_numbers]
 
     for page_number, url in tqdm(zip(page_numbers, urls), unit="pages", total=len(urls)):
-        filename = "%s_p%s.pdf" % (args.name or args.id, str(page_number).zfill(6))
+        filename = "%s_p%s.pdf" % (args.name or book_id, str(page_number).zfill(6))
 
         while True:
             try:
                 response = requests.get(url, stream=True)
 
                 if response.status_code == 404:
-                    print(f"Error: Page {page_number} for book with ID '{args.id}' not found.")
+                    print(f"Error: Page {page_number} for book with ID '{book_id}' not found.")
                     exit(1)
                 elif response.status_code == 500:
-                    print(f"Error: The server failed to serve page {page_number} for book '{args.id}', this could indicate that the book identifier is invalid.")
+                    print(f"Error: The server failed to serve page {page_number} for book '{book_id}', this could indicate that the book identifier is invalid.")
                     return
                 elif response.ok:
                     with open(filename, "wb") as handle:
diff --git a/tests/test_hathitrustdownloader.bats b/tests/test_hathitrustdownloader.bats
@@ -35,3 +35,9 @@ teardown() {
   [ -f "$TMP_DIR/test_multiple_pages_p000001.pdf" ]
   [ -f "$TMP_DIR/test_multiple_pages_p000002.pdf" ]
 }
+
+@test "Download using a complete URL" {
+  run hathitrust-downloader "https://babel.hathitrust.org/cgi/pt?id=mdp.39015027794331&seq=1" 1 1 --name "$TMP_DIR/test_url_page"
+  [ "$status" -eq 0 ]
+  [ -f "$TMP_DIR/test_url_page_p000000.pdf" ]
+}