Skip to content

Commit 8006d97

Browse files
committed
Enhance Figshare author processing: update author fetching to use YAML config, improve search accuracy with user_id and institution_id, and refactor fetching logic in figshare_fetch.py. Add new author YAML files for LCAS and UOA11, and update workflows to reflect changes in author file structure.
1 parent 4beee85 commit 8006d97

File tree

8 files changed

+396
-42
lines changed

8 files changed

+396
-42
lines changed

.github/workflows/lcas-figshare-processing.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,10 +71,10 @@ jobs:
7171
cd ./output
7272
if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ "${{ github.event.inputs.use_author_cache }}" = "true" ]; then
7373
echo "Running figshare_fetch.py with --use-author-cache (manually triggered)"
74-
python ../figshare_fetch.py --use-author-cache -f ../lcas-authors.txt --max-retries 30 --rate-limit-delay 0.1
74+
python ../figshare_fetch.py --use-author-cache -c ../lcas-authors.yaml --max-retries 30 --rate-limit-delay 0.1
7575
else
7676
echo "Running figshare_fetch.py without cache (default behavior)"
77-
python ../figshare_fetch.py -f ../lcas-authors.txt --max-retries 30 --rate-limit-delay 0.1
77+
python ../figshare_fetch.py -c ../lcas-authors.yaml --max-retries 30 --rate-limit-delay 0.1
7878
fi
7979
8080
- name: Run figshare bibtex (Step 2 - Generate bibtex from CSV)

.github/workflows/uoa11-figshare-processing.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,10 +71,10 @@ jobs:
7171
cd ./output
7272
if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ "${{ github.event.inputs.use_author_cache }}" = "true" ]; then
7373
echo "Running figshare_fetch.py with --use-author-cache (manually triggered)"
74-
python ../figshare_fetch.py -f ../uoa11-authors.txt --use-author-cache --max-retries 30 --rate-limit-delay 0.1 -o uoa11-figshare_articles.csv -O uoa11-figshare_articles_all.csv
74+
python ../figshare_fetch.py -c ../uoa11-authors.yaml --use-author-cache --max-retries 30 --rate-limit-delay 0.1 -o uoa11-figshare_articles.csv -O uoa11-figshare_articles_all.csv
7575
else
7676
echo "Running figshare_fetch.py without cache (default behavior)"
77-
python ../figshare_fetch.py -f ../uoa11-authors.txt --rate-limit-delay 0.1 --max-retries 30 -o uoa11-figshare_articles.csv -O uoa11-figshare_articles_all.csv
77+
python ../figshare_fetch.py -c ../uoa11-authors.yaml --rate-limit-delay 0.1 --max-retries 30 -o uoa11-figshare_articles.csv -O uoa11-figshare_articles_all.csv
7878
fi
7979
8080
- name: Run figshare bibtex (Step 2 - Generate bibtex from CSV)

author.py

Lines changed: 67 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,24 +13,54 @@
1313

1414

1515
class Author:
16-
def __init__(self, name, debug=False, rate_limit_delay=1.0, max_retries=5):
16+
"""Represents an author and manages their Figshare article collection.
17+
18+
This class handles retrieving, processing, and caching article metadata for
19+
a specific author from the Figshare repository.
20+
"""
21+
22+
def __init__(self, name, user_id=None, institution_id=None, orcid=None, debug=False, rate_limit_delay=1.0, max_retries=5):
    """Set up an author record together with its own Figshare client.

    Args:
        name: Author's full name (required). Also used to build the
            default cache filename in save().
        user_id: Figshare user ID (optional). Stored as given.
        institution_id: Institution ID (optional). Stored as given.
        orcid: Author's ORCID identifier (optional). Stored as given.
        debug: When true, the "Author" logger is switched to DEBUG level.
        rate_limit_delay: Passed through to the FigShare client
            (default: 1.0).
        max_retries: Passed through to the FigShare client (default: 5).
    """
    # Identity metadata is kept exactly as supplied by the caller.
    self.name, self.orcid = name, orcid
    self.user_id, self.institution_id = user_id, institution_id

    # NOTE(review): the logger is looked up by the fixed name "Author", so
    # presumably every instance shares it and debug=True on one instance
    # raises the level for all of them -- confirm this is intended.
    log = getLogger("Author")
    if debug:
        log.setLevel(DEBUG)
    self.logger = log

    # API client used for every request issued on behalf of this author.
    self.fs = FigShare(max_retries=max_retries, rate_limit_delay=rate_limit_delay)

    # Result holders; populated later by the retrieval methods.
    self.public_html_prefix = "https://repository.lincoln.ac.uk"
    self.articles = {}
    self.df = None
2545

2646
def save(self, filename=None):
    """Persist the author's articles and dataframe to a shelve file.

    Args:
        filename: Path of the shelve file to write. When omitted, the file
            is named '{author_name}.db'.
    """
    target = f"{self.name}.db" if filename is None else filename
    # Both entries are written under fixed keys so load() can find them.
    payload = (('articles', self.articles), ('df', self.df))
    with shelve.open(target) as store:
        for key, value in payload:
            store[key] = value
3257

3358
def load(self, filename=None):
59+
"""Load author's articles and dataframe from a persistent cache file.
60+
61+
Args:
62+
filename: Path to cache file (default: '{author_name}.db')
63+
"""
3464
if filename is None:
3565
filename = f"{self.name}.db"
3666
with shelve.open(filename) as db:
@@ -39,12 +69,47 @@ def load(self, filename=None):
3969

4070

4171
def _retrieve_figshare(self, use_cache=True):
72+
"""Retrieve articles for this author from Figshare.
73+
74+
Uses the most precise search method available based on the author metadata:
75+
- If user_id and/or institution_id are available, uses articles_by_author()
76+
with filtering for more accurate results
77+
- Otherwise, falls back to simple name-based search
78+
79+
Args:
80+
use_cache: Whether to use cached API results (default: True)
81+
"""
4282
self.logger.info(f"retrieving articles for {self.name}")
43-
self.articles = self.fs.articles_by_user_name(self.name, use_cache=use_cache)
83+
84+
# Use enhanced search with user_id tracking and institution filtering when available
85+
if self.user_id or self.institution_id:
86+
if self.user_id and self.institution_id:
87+
self.logger.info(f"Using enhanced search for user_id {self.user_id} with institution_id {self.institution_id}")
88+
elif self.user_id:
89+
self.logger.info(f"Using enhanced search for user_id {self.user_id}")
90+
else:
91+
self.logger.info(f"Using enhanced search with institution_id {self.institution_id}")
92+
93+
self.articles = self.fs.articles_by_author(
94+
self.name,
95+
user_id=self.user_id,
96+
institution_id=self.institution_id,
97+
use_cache=use_cache
98+
)
99+
else:
100+
self.logger.info(f"Using basic name search (no user_id or institution_id available)")
101+
self.articles = self.fs.articles_by_user_name(self.name, use_cache=use_cache)
44102

45103
self.logger.info(f"found {len(self.articles)} articles for {self.name}")
46104

47105
def _retrieve_details(self, use_cache=True):
106+
"""Retrieve detailed metadata for each article.
107+
108+
Fetches full article details including custom fields, tags, categories, etc.
109+
110+
Args:
111+
use_cache: Whether to use cached API results (default: True)
112+
"""
48113
for article in self.articles:
49114
self.logger.info(f"retrieving details for article {article['id']}")
50115
article['details'] = self.fs.get_article(article['id'], use_cache=use_cache)

figshare_api.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,8 +139,80 @@ def __post(self, url, params=None, use_cache=True):
139139
self.logger.warning(f"Received empty or invalid JSON response for POST {self.base_url + url} (status: {response.status_code})")
140140
return []
141141

142+
def articles_by_author(self, author_name, user_id=None, institution_id=None, use_cache=True):
    """Search for articles by author name with optional institution filtering.

    Posts to "/articles/search" using the ``:author:`` search operator and,
    when given, an ``institution`` parameter. The search is always by author
    name; ``user_id`` does not influence the query and only appears in log
    messages.

    Args:
        author_name: The author's full name to search for (required).
        user_id: Figshare user ID (optional, used only in log output).
        institution_id: Institution ID added to the query as the
            ``institution`` parameter when truthy (optional).
        use_cache: Forwarded to the underlying POST helper (default: True).

    Returns:
        List of the article entries collected from every page, in the order
        the pages were returned. Pagination stops at the first empty page,
        or at the first response that is not a list.

    Example:
        articles = fs.articles_by_author(
            "Marc Hanheide",
            user_id=17159320,
            institution_id=1068
        )
    """
    params = self.__init_params()

    # The author name is quoted so it is searched as a single phrase.
    params["search_for"] = f':author: "{author_name}"'

    # Narrow the result set to one institution when an ID is supplied.
    if institution_id:
        params["institution"] = institution_id
        self.logger.info(f"Filtering by institution_id: {institution_id}")

    # Loop-invariant label for log lines; user_id is informational only.
    who = f"{author_name} (user_id: {user_id})" if user_id else author_name

    articles = []
    page = 1
    while True:
        params["page"] = page
        self.logger.info(f"retrieving page {page} for {who}")
        batch = self.__post("/articles/search", params=params, use_cache=use_cache)

        # Anything other than a list (e.g. None or an error object) cannot
        # be a page of results. Appending it would corrupt the result, and
        # a repeated non-empty error payload would keep the loop running.
        if not isinstance(batch, list):
            self.logger.warning(
                f"Unexpected non-list response on page {page} for {who}; stopping pagination"
            )
            break
        if not batch:
            break

        articles.extend(batch)
        page += 1

    self.logger.info(f"found {len(articles)} articles for {who}")
    return articles
201+
143202
def articles_by_user_name(self, user_name, use_cache=True):
203+
"""Search for articles by author name without additional filtering.
204+
205+
This is a simpler version of articles_by_author() without institution
206+
filtering or user_id tracking. Use articles_by_author() for more precise
207+
searches when institution_id is available.
208+
209+
Args:
210+
user_name: The author's full name to search for
211+
use_cache: Whether to use cached results (default: True)
212+
213+
Returns:
214+
List of article dictionaries matching the author name
215+
"""
144216
params = self.__init_params()
145217
params["search_for"] = f':author: \"{user_name}\"'
146218
page = 1
@@ -159,3 +231,17 @@ def articles_by_user_name(self, user_name, use_cache=True):
159231

160232
def get_article(self, article_id, use_cache=True):
    """Fetch one article record via GET "/articles/{article_id}".

    Args:
        article_id: Identifier interpolated into the request path.
        use_cache: Forwarded to the underlying GET helper (default: True).

    Returns:
        Whatever the GET helper returns for that path.
    """
    endpoint = f"/articles/{article_id}"
    return self.__get(endpoint, use_cache=use_cache)
234+
235+
def search_authors(self, params, use_cache=True):
    """Search for authors through the "/account/authors/search" endpoint.

    Args:
        params: Dictionary of search parameters, sent unchanged as the POST
            parameters (e.g. search, orcid, is_active, is_public, group_id,
            institution_id).
        use_cache: Forwarded to the underlying POST helper (default: True).

    Returns:
        The POST helper's result for the search, unmodified -- presumably a
        list of author dictionaries; verify against the API response.
    """
    endpoint = "/account/authors/search"
    self.logger.info(f"Searching for authors with params: {params}")
    matches = self.__post(endpoint, params=params, use_cache=use_cache)
    return matches

0 commit comments

Comments
 (0)