7 | 7 | import os |
8 | 8 | import requests |
9 | 9 | import warnings |
10 | | -import yaml |
11 | | -from typing import Iterable, List, Dict, Any, Optional |
12 | | - |
13 | | - |
14 | | - |
15 | | -def _gather_files_from_response(resp: requests.Response) -> List[Dict[str, Any]]: |
16 | | - """ |
17 | | - Normalize Figshare API responses into a list of file dicts. |
18 | | - |
19 | | - Supports: |
20 | | - 1) Article endpoint: https://api.figshare.com/v2/articles/{id} |
21 | | - -> JSON object with key 'files' (list) |
22 | | - |
23 | | - 2) Files endpoint: https://api.figshare.com/v2/articles/{id}/files[?...] |
24 | | - -> JSON list of file objects (possibly paginated with Link headers) |
25 | | - """ |
26 | | - data = resp.json() |
27 | | - if isinstance(data, dict) and "files" in data and isinstance(data["files"], list): |
28 | | - return data["files"] |
29 | | - if isinstance(data, list): |
30 | | - return data |
31 | | - raise ValueError("Unexpected Figshare API response structure; expected dict with 'files' " |
32 | | - "or a list of file objects.") |
33 | | - |
34 | | - |
35 | | -def _iter_paginated_files(url: str, session: Optional[requests.Session] = None) -> Iterable[Dict[str, Any]]: |
36 | | - """ |
37 | | - Iterate over all files, following 'Link: <...>; rel=\"next\"' pagination if present. |
38 | | - Works for both the article endpoint (no pagination) and the files endpoint (may paginate). |
39 | | - """ |
40 | | - sess = session or requests.Session() |
41 | | - next_url = url |
42 | | - |
43 | | - while next_url: |
44 | | - resp = sess.get(next_url) |
45 | | - if resp.status_code != 200: |
46 | | - raise Exception(f"Failed to get dataset details from Figshare: {resp.text}") |
47 | | - |
48 | | - for f in _gather_files_from_response(resp): |
49 | | - yield f |
50 | 10 | |
51 | | - # RFC5988-style 'Link' header pagination |
52 | | - link = resp.headers.get("Link") or resp.headers.get("link") |
53 | | - next_url = None |
54 | | - if link: |
55 | | - parts = [p.strip() for p in link.split(",")] |
56 | | - for part in parts: |
57 | | - if 'rel="next"' in part: |
58 | | - start = part.find("<") + 1 |
59 | | - end = part.find(">", start) |
60 | | - if start > 0 and end > start: |
61 | | - next_url = part[start:end] |
62 | | - break |
| 11 | +import yaml |
63 | 12 | |
64 | 13 | def download( |
65 | 14 | name: str='all', |
@@ -97,73 +46,81 @@ def download( |
97 | 46 | local_path = Path(local_path) |
98 | 47 | |
99 | 48 | if not local_path.exists(): |
100 | | - local_path.mkdir(parents=True, exist_ok=True) |
| 49 | + Path.mkdir(local_path) |
101 | 50 | # Get the dataset details |
102 | 51 | with resources.open_text('coderdata', 'dataset.yml') as f: |
103 | 52 | data_information = yaml.load(f, Loader=yaml.FullLoader) |
104 | 53 | url = data_information['figshare'] |
| 54 | + |
| 55 | + response = requests.get(url) |
| 56 | + if response.status_code != 200: |
| 57 | + raise Exception( |
| 58 | + f"Failed to get dataset details from Figshare: {response.text}" |
| 59 | + ) |
| 60 | + |
| 61 | + data = response.json() |
105 | 62 | |
106 | | - name = (name or "all").casefold() |
107 | | - session = requests.Session() |
108 | | - all_files = list(_iter_paginated_files(url, session=session)) |
| 63 | + # making sure that we are case insensitive |
| 64 | + name = name.casefold() |
109 | 65 | |
| 66 | + # Filter files by the specified prefix |
110 | 67 | if name != "all": |
111 | 68 | filtered_files = [ |
112 | | - f for f in all_files |
113 | | - if (f.get('name', '').casefold().startswith(name)) or ('genes' in f.get('name', '').casefold()) |
114 | | - ] |
| 69 | + file |
| 70 | + for file |
| 71 | + in data['files'] |
| 72 | + if file['name'].startswith(name) or 'genes' in file['name'] |
| 73 | + ] |
115 | 74 | else: |
116 | | - filtered_files = all_files |
| 75 | + filtered_files = data['files'] |
117 | 76 | |
| 77 | + # Group files by name and select the one with the highest ID |
118 | 78 | unique_files = {} |
119 | 79 | for file in filtered_files: |
120 | | - fname = file.get('name') |
121 | | - fid = file.get('id') |
122 | | - if fname is None or fid is None: |
123 | | - continue |
124 | | - file_name = local_path.joinpath(fname) |
125 | | - if (file_name not in unique_files) or (fid > unique_files[file_name]['id']): |
126 | | - unique_files[file_name] = {'file_info': file, 'id': fid} |
| 80 | + file_name = local_path.joinpath(file['name']) |
| 81 | + file_id = file['id'] |
| 82 | + if ( |
| 83 | + file_name not in unique_files |
| 84 | + or file_id > unique_files[file_name]['id'] |
| 85 | + ): |
| 86 | + unique_files[file_name] = {'file_info': file, 'id': file_id} |
127 | 87 | |
128 | 88 | for file_name, file_data in unique_files.items(): |
129 | 89 | file_info = file_data['file_info'] |
130 | 90 | file_id = str(file_info['id']) |
131 | | - file_url = f"https://api.figshare.com/v2/file/download/{file_id}" |
132 | | - file_md5sum = file_info.get('supplied_md5') |
133 | | - |
134 | | - if file_name.exists() and not exist_ok: |
135 | | - warnings.warn( |
136 | | - f"{file_name} already exists. Use argument 'exist_ok=True' to overwrite the existing file." |
137 | | - ) |
138 | | - |
| 91 | + file_url = "https://api.figshare.com/v2/file/download/" + file_id |
| 92 | + file_md5sum = file_info['supplied_md5'] |
139 | 93 | retry_count = 10 |
| 94 | + # Download the file |
140 | 95 | while retry_count > 0: |
141 | | - with session.get(file_url, stream=True) as r: |
| 96 | + with requests.get(file_url, stream=True) as r: |
142 | 97 | r.raise_for_status() |
143 | | - with open(file_name, 'wb') as f: |
144 | | - for chunk in r.iter_content(chunk_size=8192): |
145 | | - f.write(chunk) |
146 | | - |
147 | | - if file_md5sum: |
148 | | - with open(file_name, 'rb') as f: |
149 | | - check_md5sum = md5(f.read()).hexdigest() |
150 | | - if file_md5sum == check_md5sum: |
151 | | - break |
152 | | - else: |
153 | | - retry_count -= 1 |
154 | | - if retry_count > 0: |
155 | | - warnings.warn( |
156 | | - f"{file_name} failed MD5 verification " |
157 | | - f"(expected: {file_md5sum}, got: {check_md5sum}). Retrying..." |
| 98 | + if file_name.exists() and not exist_ok: |
| 99 | + warnings.warn( |
| 100 | + f"{file_name} already exists. Use argument 'exist_ok=True'" |
| 101 | + " to overwrite existing file." |
158 | 102 | ) |
159 | | - else: |
| 103 | + else: |
| 104 | + with open(file_name, 'wb') as f: |
| 105 | + for chunk in r.iter_content(chunk_size=8192): |
| 106 | + f.write(chunk) |
| 107 | + with open(file_name, 'rb') as f: |
| 108 | + check_md5sum = md5(f.read()).hexdigest() |
| 109 | + if file_md5sum == check_md5sum: |
160 | 110 | break |
161 | | - |
162 | | - if retry_count == 0 and file_md5sum: |
| 111 | + elif retry_count > 0: |
| 112 | + warnings.warn( |
| 113 | + f"{file_name} could not be downloaded successfully. " |
| 114 | + f"(expected md5sum: {file_md5sum} - " |
| 115 | + f"calculated md5sum: {check_md5sum})... retrying..." |
| 116 | + ) |
| 117 | + retry_count = retry_count - 1 |
| 118 | + if retry_count == 0: |
163 | 119 | warnings.warn( |
164 | | - f"{file_name} could not be downloaded with a matching MD5 after retries." |
165 | | - ) |
| 120 | + f"{file_name} could not be downloaded. Try again." |
| 121 | + ) |
166 | 122 | else: |
167 | 123 | print(f"Downloaded '{file_url}' to '{file_name}'") |
168 | 124 | |
| 125 | + return |
169 | 126 | |
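
For context, the removed `_iter_paginated_files` helper walks RFC 5988 `Link: <...>; rel="next"` headers when the Figshare files endpoint paginates, while the article endpoint returns a single object with a `files` list. Below is a minimal standalone sketch of that pattern, assuming only what the docstrings in the diff describe; the function name `iter_figshare_files` and its parameters are illustrative and not part of this commit.

```python
import requests

def iter_figshare_files(url, session=None):
    """Yield Figshare file dicts, following 'Link: <...>; rel="next"' pagination."""
    sess = session or requests.Session()
    next_url = url
    while next_url:
        resp = sess.get(next_url)
        resp.raise_for_status()
        data = resp.json()
        # Article endpoint -> dict with a 'files' list; files endpoint -> bare list.
        files = data.get("files", []) if isinstance(data, dict) else data
        for f in files:
            yield f
        # Look for an RFC 5988 'next' link; stop when there is none.
        next_url = None
        for part in (resp.headers.get("Link") or "").split(","):
            if 'rel="next"' in part:
                start = part.find("<") + 1
                end = part.find(">", start)
                if 0 < start < end:
                    next_url = part[start:end]
                break
```

Iterating `iter_figshare_files("https://api.figshare.com/v2/articles/<article_id>/files")` would then yield every file object regardless of whether the server paginates the listing.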
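The download loop in the diff streams each file to disk and compares its MD5 against Figshare's `supplied_md5`, retrying on mismatch. A rough standalone sketch of that verify-and-retry pattern follows; the helper name `download_verified` and the retry count are illustrative assumptions, not this commit's API.

```python
import warnings
from hashlib import md5
from pathlib import Path

import requests

def download_verified(file_url, dest, expected_md5, retries=10):
    """Stream a file to disk and retry until its MD5 matches the supplied checksum."""
    dest = Path(dest)
    for _ in range(retries):
        with requests.get(file_url, stream=True) as r:
            r.raise_for_status()
            with open(dest, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        # Verify the checksum before accepting the download.
        if md5(dest.read_bytes()).hexdigest() == expected_md5:
            return True
        warnings.warn(f"{dest} failed MD5 verification; retrying...")
    return False
```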