Skip to content

Commit b084859

Browse files
committed
Merge branch 'release/0.1.2'
2 parents eed3ae6 + 2769e46 commit b084859

File tree

4 files changed

+97
-2
lines changed

4 files changed

+97
-2
lines changed

src/db.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,27 @@ def get_canonical_title(work) -> str:
104104

105105
return getattr(work, "fullTitle", None)
106106

107+
@staticmethod
def get_page_range(work: Dict) -> "str | None":
    """Return a chapter page range from Thoth metadata.

    ``pageInterval`` is returned unchanged when it is set. Otherwise the
    range is built as ``"<first>-<last>"`` from ``firstPage`` and
    ``lastPage``; when only one of the two is set it is used for both
    ends (e.g. ``"7-7"``).

    Args:
        work: Mapping of work metadata. The keys ``pageInterval``,
            ``firstPage`` and ``lastPage`` are read; missing or falsy
            values are treated as absent.

    Returns:
        The page range, or ``None`` when none of the three fields is set.
    """
    page_interval = work.get("pageInterval")
    if page_interval:
        return page_interval

    # A lone boundary stands in for the missing one, giving a one-page range.
    first_page = work.get("firstPage") or work.get("lastPage")
    last_page = work.get("lastPage") or first_page
    if not first_page:
        return None

    return f"{first_page}-{last_page}"
127+
107128
def get_book(self) -> Dict:
108129
"""Return book data"""
109130
work = self.db.work_by_doi(doi=self.doi)
@@ -124,6 +145,8 @@ def get_chapters(self, book: Dict) -> List:
124145
copyrightHolder
125146
longAbstract
126147
pageInterval
148+
firstPage
149+
lastPage
127150
doi
128151
license
129152
imprint {
@@ -150,7 +173,7 @@ def get_chapters(self, book: Dict) -> List:
150173
"title": work.get("fullTitle"),
151174
"publisher": work.get("imprint", {}).get("imprintName"),
152175
"abstract": work.get("longAbstract"),
153-
"pages": work.get("pageInterval"),
176+
"pages": self.get_page_range(work),
154177
"licence": work.get("license"),
155178
"workId": work.get("workId")}
156179
chapters.append(data)

src/main.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,14 @@ def run(input_file: Path = typer.Option("./file.pdf",
3333

3434
# Iterate over chapters metadata
3535
for chapter in metadata.get_chapters():
36-
page_range = re.split('-|–', chapter.get("pages"))
36+
pages = chapter.get("pages")
37+
if not pages:
38+
raise ValueError(
39+
"Missing page range for chapter "
40+
f"{chapter.get('doi') or chapter.get('title')}"
41+
)
42+
43+
page_range = re.split('-|–', pages)
3744

3845
doi_fragments = chapter.get("doi").split('/')
3946
output_file_name = doi_fragments[-1].lower() + '.pdf'

src/test_db.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,18 @@ def work_by_doi(self, doi):
3232
}
3333

3434

35+
def test_thoth_get_page_range_falls_back_to_first_and_last_page():
    """With no pageInterval, the range is built from firstPage and lastPage."""
    work = {"pageInterval": None, "firstPage": "11", "lastPage": "20"}

    page_range = Thoth.get_page_range(work)

    assert page_range == "11-20"
39+
40+
41+
def test_thoth_get_page_range_uses_single_page_when_needed():
    """With only firstPage set, that page is used for both ends of the range."""
    work = {"pageInterval": None, "firstPage": "7", "lastPage": None}

    page_range = Thoth.get_page_range(work)

    assert page_range == "7-7"
45+
46+
3547
def test_thoth_write_urls_uses_pat_and_preserves_payloads(monkeypatch):
3648
class MockDb:
3749
def __init__(self):

src/test_main.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,3 +97,56 @@ def write_urls(self, chapter):
9797
"workId": "work-1",
9898
}
9999
]
100+
101+
102+
def test_cli_raises_clear_error_when_pages_missing(monkeypatch, tmp_path):
    """The CLI fails with a descriptive ValueError when a chapter has no pages."""
    # Minimal on-disk fixtures: a stub PDF and an empty output directory.
    source_pdf = tmp_path / "book.pdf"
    source_pdf.write_bytes(b"%PDF-1.4\n")
    destination = tmp_path / "output"
    destination.mkdir()

    class MockPdf:
        def __init__(self, source, tmp_dir):
            self.tmp_dir = Path(tmp_dir)

        def merge_pdfs(self, page_range, output_file_name):
            raise AssertionError("merge_pdfs should not be called")

    class MockMetadata:
        def __init__(self, database, doi):
            pass

        def get_chapters(self):
            # A single chapter whose page range is missing.
            chapter = {
                "pages": None,
                "doi": "10.11647/obp.0309.01",
                "workId": "work-1",
            }
            return [chapter]

        def write_metadata(self, chapter, output_file_path):
            raise AssertionError("write_metadata should not be called")

        def write_urls(self, chapter):
            raise AssertionError("write_urls should not be called")

    monkeypatch.setattr(main, "Pdf", MockPdf)
    monkeypatch.setattr(main, "Metadata", MockMetadata)

    cli_args = [
        "--input-file",
        str(source_pdf),
        "--output-folder",
        str(destination),
        "10.11647/obp.0309",
    ]
    result = runner.invoke(main.app, cli_args)

    expected_message = "Missing page range for chapter 10.11647/obp.0309.01"
    assert result.exit_code != 0
    assert isinstance(result.exception, ValueError)
    assert str(result.exception) == expected_message

0 commit comments

Comments
 (0)