
Commit 4597a7c

Fixes figshare API
1 parent d40e39d commit 4597a7c

File tree: 3 files changed, +56 -104 lines

coderdata/dataset.yml

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-figshare: https://api.figshare.com/v2/articles/29923646/files?page=1&page_size=500
+figshare: https://api.figshare.com/v2/articles/29923646
 version: 2.2.0
 datasets:
   beataml:
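For context, the two endpoints return different shapes: the article endpoint returns a single JSON object whose 'files' key lists the file records, while the old files endpoint returns a paginated JSON list. A minimal sketch (not part of the commit; only the article ID and URLs are taken from dataset.yml, the rest is illustrative):

import requests

# New dataset.yml value: the article endpoint returns a JSON object whose
# 'files' key holds the list of file records attached to the article.
article = requests.get("https://api.figshare.com/v2/articles/29923646").json()
print(type(article))          # dict
print(len(article["files"]))  # number of files attached to the article

# Old dataset.yml value: the files endpoint returns a JSON list of file
# records, served page by page via the page/page_size query parameters.
files = requests.get(
    "https://api.figshare.com/v2/articles/29923646/files",
    params={"page": 1, "page_size": 500},
).json()
print(type(files))            # list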

coderdata/download/downloader.py

Lines changed: 53 additions & 96 deletions

@@ -7,59 +7,8 @@
 import os
 import requests
 import warnings
-import yaml
-from typing import Iterable, List, Dict, Any, Optional
-
-
-
-def _gather_files_from_response(resp: requests.Response) -> List[Dict[str, Any]]:
-    """
-    Normalize Figshare API responses into a list of file dicts.
-
-    Supports:
-    1) Article endpoint: https://api.figshare.com/v2/articles/{id}
-       -> JSON object with key 'files' (list)
-
-    2) Files endpoint: https://api.figshare.com/v2/articles/{id}/files[?...]
-       -> JSON list of file objects (possibly paginated with Link headers)
-    """
-    data = resp.json()
-    if isinstance(data, dict) and "files" in data and isinstance(data["files"], list):
-        return data["files"]
-    if isinstance(data, list):
-        return data
-    raise ValueError("Unexpected Figshare API response structure; expected dict with 'files' "
-                     "or a list of file objects.")
-
-
-def _iter_paginated_files(url: str, session: Optional[requests.Session] = None) -> Iterable[Dict[str, Any]]:
-    """
-    Iterate over all files, following 'Link: <...>; rel=\"next\"' pagination if present.
-    Works for both the article endpoint (no pagination) and the files endpoint (may paginate).
-    """
-    sess = session or requests.Session()
-    next_url = url
-
-    while next_url:
-        resp = sess.get(next_url)
-        if resp.status_code != 200:
-            raise Exception(f"Failed to get dataset details from Figshare: {resp.text}")
-
-        for f in _gather_files_from_response(resp):
-            yield f

-        # RFC5988-style 'Link' header pagination
-        link = resp.headers.get("Link") or resp.headers.get("link")
-        next_url = None
-        if link:
-            parts = [p.strip() for p in link.split(",")]
-            for part in parts:
-                if 'rel="next"' in part:
-                    start = part.find("<") + 1
-                    end = part.find(">", start)
-                    if start > 0 and end > start:
-                        next_url = part[start:end]
-                    break
+import yaml

 def download(
     name: str='all',
@@ -97,73 +46,81 @@ def download(
     local_path = Path(local_path)

     if not local_path.exists():
-        local_path.mkdir(parents=True, exist_ok=True)
+        Path.mkdir(local_path)
     # Get the dataset details
     with resources.open_text('coderdata', 'dataset.yml') as f:
         data_information = yaml.load(f, Loader=yaml.FullLoader)
     url = data_information['figshare']
+
+    response = requests.get(url)
+    if response.status_code != 200:
+        raise Exception(
+            f"Failed to get dataset details from Figshare: {response.text}"
+        )
+
+    data = response.json()

-    name = (name or "all").casefold()
-    session = requests.Session()
-    all_files = list(_iter_paginated_files(url, session=session))
+    # making sure that we are case insensitive
+    name = name.casefold()

+    # Filter files by the specified prefix
     if name != "all":
         filtered_files = [
-            f for f in all_files
-            if (f.get('name', '').casefold().startswith(name)) or ('genes' in f.get('name', '').casefold())
-        ]
+            file
+            for file
+            in data['files']
+            if file['name'].startswith(name) or 'genes' in file['name']
+            ]
     else:
-        filtered_files = all_files
+        filtered_files = data['files']

+    # Group files by name and select the one with the highest ID
     unique_files = {}
     for file in filtered_files:
-        fname = file.get('name')
-        fid = file.get('id')
-        if fname is None or fid is None:
-            continue
-        file_name = local_path.joinpath(fname)
-        if (file_name not in unique_files) or (fid > unique_files[file_name]['id']):
-            unique_files[file_name] = {'file_info': file, 'id': fid}
+        file_name = local_path.joinpath(file['name'])
+        file_id = file['id']
+        if (
+            file_name not in unique_files
+            or file_id > unique_files[file_name]['id']
+        ):
+            unique_files[file_name] = {'file_info': file, 'id': file_id}

     for file_name, file_data in unique_files.items():
         file_info = file_data['file_info']
         file_id = str(file_info['id'])
-        file_url = f"https://api.figshare.com/v2/file/download/{file_id}"
-        file_md5sum = file_info.get('supplied_md5')
-
-        if file_name.exists() and not exist_ok:
-            warnings.warn(
-                f"{file_name} already exists. Use argument 'exist_ok=True' to overwrite the existing file."
-            )
-
+        file_url = "https://api.figshare.com/v2/file/download/" + file_id
+        file_md5sum = file_info['supplied_md5']
         retry_count = 10
+        # Download the file
         while retry_count > 0:
-            with session.get(file_url, stream=True) as r:
+            with requests.get(file_url, stream=True) as r:
                 r.raise_for_status()
-                with open(file_name, 'wb') as f:
-                    for chunk in r.iter_content(chunk_size=8192):
-                        f.write(chunk)
-
-                if file_md5sum:
-                    with open(file_name, 'rb') as f:
-                        check_md5sum = md5(f.read()).hexdigest()
-                    if file_md5sum == check_md5sum:
-                        break
-                    else:
-                        retry_count -= 1
-                        if retry_count > 0:
-                            warnings.warn(
-                                f"{file_name} failed MD5 verification "
-                                f"(expected: {file_md5sum}, got: {check_md5sum}). Retrying..."
+                if file_name.exists() and not exist_ok:
+                    warnings.warn(
+                        f"{file_name} already exists. Use argument 'exist_ok=True'"
+                        "to overwrite existing file."
                     )
-                        else:
+                else:
+                    with open(file_name, 'wb') as f:
+                        for chunk in r.iter_content(chunk_size=8192):
+                            f.write(chunk)
+                    with open(file_name, 'rb') as f:
+                        check_md5sum = md5(f.read()).hexdigest()
+                    if file_md5sum == check_md5sum:
                         break
-
-        if retry_count == 0 and file_md5sum:
+                    elif retry_count > 0:
+                        warnings.warn(
+                            f"{file_name} could not be downloaded successfully. "
+                            f"(expected md5sum: {file_md5sum} - "
+                            f"calculated md5sum: {check_md5sum})... retrying..."
+                        )
+                    retry_count = retry_count - 1
+        if retry_count == 0:
             warnings.warn(
-                f"{file_name} could not be downloaded with a matching MD5 after retries."
-            )
+                f"{file_name} could not be downloaded. Try again."
+            )
         else:
             print(f"Downloaded '{file_url}' to '{file_name}'")

+    return
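For reference, a hedged usage sketch of the simplified downloader. The parameter names (name, local_path, exist_ok) and the prefix/'genes' filtering come from the diff above; the import path and the argument values shown here are assumptions:

# Assumed public import path for the module changed above.
from coderdata.download.downloader import download

# Fetch only files whose names start with 'beataml' (the shared 'genes' files
# always pass the filter), overwriting any copies already in ./data.
download(name="beataml", local_path="data", exist_ok=True)

# Fetch every file attached to the Figshare article.
download(name="all", local_path="data")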

scripts/push_to_figshare.py

Lines changed: 2 additions & 7 deletions

@@ -197,7 +197,7 @@ def write_figshare_details_to_yaml(article_id, project_id, title, version):
     # update dataset.yml
     with open("coderdata/dataset.yml", "r") as f:
         data = yaml.safe_load(f)
-    data["figshare"] = f"https://api.figshare.com/v2/articles/{article_id}/files?page=1&page_size=500"
+    data["figshare"] = f"https://api.figshare.com/v2/articles/{article_id}"
     data["version"] = version
     with open('/tmp/dataset.yml', 'w') as f:
         yaml.safe_dump(data, f, sort_keys=False)

@@ -232,12 +232,7 @@ def write_figshare_details_to_yaml(article_id, project_id, title, version):
         remote_file_info = get_remote_file_info(article_id, file_name)
         if remote_file_info:
             local_md5, local_size = get_file_check_data(file_path)
-            remote_md5 = (
-                remote_file_info.get('computed_md5')
-                or remote_file_info.get('md5')
-                or remote_file_info.get('supplied_md5')
-            )
-            if remote_file_info.get('size') != local_size or remote_md5 != local_md5:
+            if remote_file_info['size'] != local_size or remote_file_info['computed_md5'] != local_md5:
                 print(f"Updating file {file_name} in Figshare...")
                 delete_existing_file(article_id, remote_file_info['id'])
                 file_info = initiate_new_upload(article_id, file_path)
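The size/md5 comparison above depends on get_file_check_data() for the local values; a rough, illustrative stand-in for such a helper (not the script's actual implementation) might look like this:

import hashlib
import os

def get_file_check_data(file_path):
    # Hash the file in chunks so large data packages don't have to fit in memory.
    md5 = hashlib.md5()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            md5.update(chunk)
    # Figshare reports 'computed_md5' and 'size' on each remote file record,
    # which is what these local values are compared against.
    return md5.hexdigest(), os.path.getsize(file_path)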
