Skip to content

Commit e9c2564

Browse files
authored
feat: Adds optional support for giving a complete URL as the ID field (#22)
With this feature, you can now pass a complete URL as the `id` field, and the application will make a best-effort attempt to infer the ID from the URL. This might not work for all URLs, as the ID can be stored in different ways. If the ID cannot be parsed from the URL, please fall back to providing the ID manually.
1 parent 718dc8e commit e9c2564

File tree

3 files changed

+31
-6
lines changed

3 files changed

+31
-6
lines changed

README.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ usage: hathitrust-downloader [-h] [--name NAME] id start_page end_page
5151
Book downloader for HathiTrust
5252

5353
positional arguments:
54-
id The ID of the book, e.g 'mdp.39015027794331'.
54+
id The ID of the book, e.g 'mdp.39015027794331' or a complete URL.
5555
start_page The page number of the first page to be downloaded.
5656
end_page The last number of the last page to be downloaded (inclusive).
5757

@@ -72,6 +72,7 @@ hathitrust-downloader mdp.39015073487137 1 10 --name my-book
7272
> https://babel.hathitrust.org/cgi/pt?id=mdp.39015073487137&seq=13
7373
> ^^^^^^^^^^^^^^^^^^ This demarks the ID of this book
7474
> ```
75+
> Alternatively, you can provide the complete URL as the ID argument, and the tool will attempt to parse the ID from the URL. Note that this feature is best-effort, and for optimal stability, it is still recommended to provide the specific ID directly.
7576
7677
## Troubleshooting
7778

hathitrustdownloader/cli.py

+23-5
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,35 @@
33
import requests
44
import time
55
import argparse
6+
from urllib.parse import urlparse, parse_qs
7+
8+
def extract_id_from_url(url):
    """
    Extracts the ID parameter from a HathiTrust URL.

    HathiTrust links separate query parameters with either '&' or ';'
    (e.g. ``pt?id=mdp.39015073487137;view=1up;seq=13``). Both forms are
    accepted.

    Args:
        url (str): The complete URL containing the ID parameter.

    Returns:
        str: The extracted ID value or None if not found.
    """
    parsed_url = urlparse(url)

    # Since Python 3.9.2 / 3.10, parse_qs only splits on '&' by default, so a
    # ';'-separated query would otherwise yield 'mdp...;view=1up;seq=13' as
    # the ID. Normalising the separator restores the older, lenient behaviour.
    query = parsed_url.query.replace(';', '&')
    query_params = parse_qs(query)

    # parse_qs maps each key to a list of values; the first 'id' wins.
    id_values = query_params.get('id')
    if not id_values:
        return None
    return id_values[0]
621

722
def main():
823
parser = argparse.ArgumentParser(description='Book downloader for HathiTrust')
924

10-
parser.add_argument('id', type=str, help="The ID of the book, e.g 'mdp.39015027794331'.")
25+
parser.add_argument('id', type=str, help="The ID of the book, e.g 'mdp.39015027794331' or a complete URL.")
1126
parser.add_argument('start_page', type=int, help="The page number of the first page to be downloaded.")
1227
parser.add_argument('end_page', type=int, help="The last number of the last page to be downloaded (inclusive).")
1328
parser.add_argument('--name', dest='name', type=str, help="The start of the filename. Defaults to using the id. This can also be used to change the path.")
1429

1530
args = parser.parse_args()
1631

32+
# Extract ID from URL if necessary
33+
book_id = extract_id_from_url(args.id) if args.id.startswith("http") else args.id
34+
1735
# If --name is used to specify a path, extract the directory part
1836
if args.name:
1937
directory = os.path.dirname(args.name)
@@ -26,20 +44,20 @@ def main():
2644
return
2745

2846
page_numbers = [i for i in range(args.start_page - 1, args.end_page)]
29-
urls = ["https://babel.hathitrust.org/cgi/imgsrv/download/pdf?id=%s;orient=0;size=100;seq=%s;attachment=0" % (args.id, i + 1) for i in page_numbers]
47+
urls = ["https://babel.hathitrust.org/cgi/imgsrv/download/pdf?id=%s;orient=0;size=100;seq=%s;attachment=0" % (book_id, i + 1) for i in page_numbers]
3048

3149
for page_number, url in tqdm(zip(page_numbers, urls), unit="pages", total=len(urls)):
32-
filename = "%s_p%s.pdf" % (args.name or args.id, str(page_number).zfill(6))
50+
filename = "%s_p%s.pdf" % (args.name or book_id, str(page_number).zfill(6))
3351

3452
while True:
3553
try:
3654
response = requests.get(url, stream=True)
3755

3856
if response.status_code == 404:
39-
print(f"Error: Page {page_number} for book with ID '{args.id}' not found.")
57+
print(f"Error: Page {page_number} for book with ID '{book_id}' not found.")
4058
exit(1)
4159
elif response.status_code == 500:
42-
print(f"Error: The server failed to serve page {page_number} for book '{args.id}', this could indicate that the book identifier is invalid.")
60+
print(f"Error: The server failed to serve page {page_number} for book '{book_id}', this could indicate that the book identifier is invalid.")
4361
return
4462
elif response.ok:
4563
with open(filename, "wb") as handle:

tests/test_hathitrustdownloader.bats

+6
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,9 @@ teardown() {
3535
[ -f "$TMP_DIR/test_multiple_pages_p000001.pdf" ]
3636
[ -f "$TMP_DIR/test_multiple_pages_p000002.pdf" ]
3737
}
38+
39+
@test "Download using a complete URL" {
40+
run hathitrust-downloader "https://babel.hathitrust.org/cgi/pt?id=mdp.39015027794331&seq=1" 1 1 --name "$TMP_DIR/test_url_page"
41+
[ "$status" -eq 0 ]
42+
[ -f "$TMP_DIR/test_url_page_p000000.pdf" ]
43+
}

0 commit comments

Comments
 (0)