Skip to content

Commit c56cece

Browse files
author
WorkBuddy
committed
fix(pubmed): fix ESummary/EFetch data shape mismatches
- utils: extractAuthors now handles both the ESummary {name, authtype} and the EFetch {lastname, initials} formats
- utils: formatArticleType now handles the ESummary string[] pubtype format
- utils: buildEutilsUrl allows callers to override the default retmode=json
- article: switch from ESummary to EFetch XML for the full abstract, MeSH terms, keywords, and author affiliations
- related: switch to the neighbor_score cmd to get similarity scores; normalize links to {id, score} objects; filter out the source PMID

Fixes issues raised by Astro-Han in code review.
1 parent df15093 commit c56cece

File tree

3 files changed

+198
-136
lines changed

3 files changed

+198
-136
lines changed

clis/pubmed/article.ts

Lines changed: 156 additions & 115 deletions
Original file line numberDiff line numberDiff line change
@@ -2,30 +2,148 @@
22
* PubMed Article Details Adapter
33
*
44
* Get detailed information about a specific PubMed article by PMID.
5-
* Uses ESummary API to retrieve metadata (ESummary returns JSON, EFetch returns XML).
5+
* Uses EFetch API (XML) for full article details including abstract,
6+
* MeSH terms, keywords, and author affiliations.
67
*
78
* API Documentation:
8-
* - ESummary: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESummary
9+
* - EFetch: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
910
*/
1011

1112
import { cli, Strategy } from '@jackwener/opencli/registry';
1213
import { CliError } from '@jackwener/opencli/errors';
1314
import {
14-
eutilsFetch,
15-
extractAuthors,
16-
extractFirstAuthor,
17-
extractCorrespondingAuthor,
18-
extractDoi,
19-
extractPmcId,
15+
eutilsFetchText,
2016
buildPubMedUrl,
2117
truncateText,
22-
formatArticleType,
2318
} from './utils.js';
2419

20+
/**
 * Parse an EFetch XML response (PubMed, `retmode=xml`) into a structured
 * article record.
 *
 * The parser is regex-based (no XML library) and therefore scopes every
 * lookup to the smallest enclosing element it can find, so that same-named
 * tags elsewhere in the record do not leak into the result:
 *  - publication date is read from the journal's `<PubDate>` (falling back to
 *    `<ArticleDate>`), not from the first `<Year>` in the document;
 *  - the abstract is read from `<Abstract>`, not from `<OtherAbstract>`;
 *  - DOI / PMC IDs are read from the part of the record that precedes the
 *    first `<ReferenceList>`, so IDs of cited references are never reported
 *    as the article's own.
 *
 * Text values have nested markup removed and XML entities decoded.
 *
 * @param xml  Raw EFetch XML for a single article.
 * @param pmid PubMed ID the XML was requested for; echoed into the result and
 *             used to build the article URL.
 * @returns A plain object with `pmid`, `title`, `abstract`, `authors`,
 *          `journal`, `publication`, `ids`, `classification` and `url`.
 *          Missing values are empty strings / empty arrays.
 */
function parseEFetchXml(xml: string, pmid: string) {
  // The five entities predefined by XML; anything else is left untouched.
  const NAMED_ENTITIES: Record<string, string> = {
    amp: '&',
    lt: '<',
    gt: '>',
    quot: '"',
    apos: "'",
  };

  // Single-pass decode so that "&amp;lt;" becomes "&lt;" and is not decoded twice.
  const decodeEntities = (text: string): string =>
    text.replace(/&(#[xX][0-9a-fA-F]+|#\d+|[A-Za-z]+);/g, (whole: string, body: string) => {
      if (body.startsWith('#')) {
        const isHex = body[1] === 'x' || body[1] === 'X';
        const code = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
        // String.fromCodePoint throws above U+10FFFF; keep the raw text instead.
        return code <= 0x10ffff ? String.fromCodePoint(code) : whole;
      }
      return NAMED_ENTITIES[body] ?? whole;
    });

  // Strip nested markup first, then decode, so a decoded "<" is never mistaken for a tag.
  const clean = (raw: string): string => decodeEntities(raw.replace(/<[^>]+>/g, '')).trim();

  // Matches <tag> or <tag attr="..."> but NOT <tagSuffix ...> and NOT self-closing <tag/>.
  const tagPattern = (tag: string, flags: string): RegExp =>
    new RegExp(`<${tag}(?:\\s[^>]*)?(?<!/)>([\\s\\S]*?)</${tag}>`, flags);

  // Raw inner XML of the first / of every occurrence of `tag` in `src`.
  const firstRaw = (src: string, tag: string): string => src.match(tagPattern(tag, 'i'))?.[1] ?? '';
  const allRaw = (src: string, tag: string): string[] =>
    Array.from(src.matchAll(tagPattern(tag, 'gi')), (m) => m[1] ?? '');

  // Cleaned text content of the first / of every occurrence of `tag` in `src`.
  const getTag = (src: string, tag: string): string => clean(firstRaw(src, tag));
  const getAllTags = (src: string, tag: string): string[] => allRaw(src, tag).map(clean);

  // Abstract - may have multiple AbstractText sections (structured abstract).
  // Restrict to <Abstract> so <OtherAbstract> text is not appended; fall back
  // to the whole record when no <Abstract> element is present.
  const abstractXml = firstRaw(xml, 'Abstract') || xml;
  const abstract = getAllTags(abstractXml, 'AbstractText').join(' ').replace(/\s+/g, ' ').trim();

  // Title
  const title = getTag(xml, 'ArticleTitle');

  // Journal
  const journalXml = firstRaw(xml, 'Journal') || xml;
  const journalTitle = getTag(journalXml, 'Title');
  const isoAbbreviation = getTag(journalXml, 'ISOAbbreviation');
  const volume = getTag(journalXml, 'Volume');
  const issue = getTag(journalXml, 'Issue');
  const pagination = getTag(xml, 'MedlinePgn');

  // Publication date: <DateCompleted>/<DateRevised> precede <Article> in the
  // record, so the lookup must be limited to the journal issue's <PubDate>.
  const pubDateXml = firstRaw(journalXml, 'PubDate') || firstRaw(xml, 'ArticleDate');
  const explicitYear = getTag(pubDateXml, 'Year');
  // <MedlineDate> is free text such as "1998 Dec-1999 Jan"; take its first 4-digit run as the year.
  const medlineDate = getTag(pubDateXml, 'MedlineDate');
  const year = explicitYear || (medlineDate.match(/\d{4}/)?.[0] ?? '');
  const month = getTag(pubDateXml, 'Month');
  const day = getTag(pubDateXml, 'Day');
  const fullDate =
    explicitYear || !medlineDate
      ? [year, month, day].filter(Boolean).join(' ')
      : medlineDate;

  // Authors and affiliations
  const authorListXml = firstRaw(xml, 'AuthorList') || xml;
  const authors: Array<{ name: string; affiliations: string[] }> = allRaw(authorListXml, 'Author').map(
    (block) => {
      const lastName = getTag(block, 'LastName');
      const foreName = getTag(block, 'ForeName') || getTag(block, 'Initials');
      const collectiveName = getTag(block, 'CollectiveName');
      return {
        name: collectiveName || `${lastName} ${foreName}`.trim(),
        // An author may carry several <AffiliationInfo> entries; keep them all.
        affiliations: getAllTags(block, 'Affiliation').filter(Boolean),
      };
    },
  );

  const allAuthors = authors.map((a) => a.name);
  const firstAuthor = allAuthors[0] || '';
  // Heuristic: the XML read here has no corresponding-author marker, so the
  // last listed author is reported.
  const correspondingAuthor = allAuthors[allAuthors.length - 1] || '';

  // Unique affiliations, in order of first appearance
  const affiliations = [...new Set(authors.flatMap((a) => a.affiliations))];

  // MeSH terms (capped at 10)
  const meshTerms = allRaw(xml, 'MeshHeading')
    .map((block) => getTag(block, 'DescriptorName'))
    .filter(Boolean)
    .slice(0, 10);

  // Keywords (capped at 10)
  const keywords = getAllTags(xml, 'Keyword').filter(Boolean).slice(0, 10);

  // Article type
  const pubTypes = getAllTags(xml, 'PublicationType').filter(Boolean);
  const articleType = pubTypes[0] || 'Journal Article';

  // Language
  const language = getTag(xml, 'Language');

  // IDs: cited references carry their own <ArticleId> elements inside
  // <ReferenceList>, which follows the article's own <ArticleIdList>; cut the
  // record there so a reference's DOI/PMC ID is never reported for the article.
  const refListStart = xml.search(/<ReferenceList[\s>]/i);
  const idScope = refListStart === -1 ? xml : xml.slice(0, refListStart);

  const doiMatch =
    idScope.match(/<ArticleId IdType="doi">([^<]+)<\/ArticleId>/i) ??
    // Some records only expose the DOI through <ELocationID>.
    idScope.match(/<ELocationID\b[^>]*EIdType="doi"[^>]*>([^<]+)<\/ELocationID>/i);
  const doi = doiMatch?.[1] ? decodeEntities(doiMatch[1]).trim() : '';

  const pmcMatch = idScope.match(/<ArticleId IdType="pmc">([^<]+)<\/ArticleId>/i);
  const pmcId = pmcMatch?.[1] ? pmcMatch[1].trim() : '';

  return {
    pmid,
    title,
    abstract,
    authors: {
      list: allAuthors,
      all: allAuthors.slice(0, 10).join(', ') + (allAuthors.length > 10 ? ', et al.' : ''),
      first: firstAuthor,
      corresponding: correspondingAuthor,
      count: allAuthors.length,
      affiliations,
    },
    journal: {
      title: journalTitle,
      isoAbbreviation,
      volume,
      issue,
      pagination,
    },
    publication: {
      year,
      fullDate,
    },
    ids: {
      pmid,
      doi,
      pmc: pmcId,
    },
    classification: {
      articleType,
      pubTypes,
      language,
      meshTerms,
      keywords,
    },
    url: buildPubMedUrl(pmid),
  };
}
142+
25143
cli({
26144
site: 'pubmed',
27145
name: 'article',
28-
description: 'Get detailed information about a PubMed article by PMID',
146+
description: 'Get detailed information about a PubMed article by PMID (full abstract, MeSH terms, affiliations)',
29147
strategy: Strategy.PUBLIC,
30148
browser: false,
31149
args: [
@@ -34,7 +152,7 @@ cli({
34152
type: 'string',
35153
required: true,
36154
positional: true,
37-
help: 'PubMed ID (e.g., "37780221", "37158692")',
155+
help: 'PubMed ID (e.g., "37780221")',
38156
},
39157
{
40158
name: 'output',
@@ -43,14 +161,10 @@ cli({
43161
help: 'Output format: table (summary) or json (full details)',
44162
},
45163
],
46-
columns: [
47-
'field',
48-
'value',
49-
],
164+
columns: ['field', 'value'],
50165
func: async (_page, args) => {
51166
const pmid = args.pmid.trim();
52167

53-
// Validate PMID format
54168
if (!/^\d+$/.test(pmid)) {
55169
throw new CliError(
56170
'INVALID_ARGUMENT',
@@ -59,125 +173,52 @@ cli({
59173
);
60174
}
61175

62-
// Use ESummary to get article details (returns JSON, unlike EFetch which returns XML)
63-
const esummaryResult = await eutilsFetch('esummary', {
176+
// Use EFetch to get full article details (XML includes abstract, MeSH, affiliations)
177+
const xml = await eutilsFetchText('efetch', {
64178
id: pmid,
179+
rettype: 'abstract',
180+
retmode: 'xml',
65181
});
66182

67-
const article = esummaryResult.result?.[pmid];
68-
if (!article) {
183+
if (!xml || xml.includes('<ERROR>') || !xml.includes('<PubmedArticle>')) {
69184
throw new CliError(
70185
'NOT_FOUND',
71186
`Article with PMID ${pmid} not found`,
72187
'Check the PMID and try again'
73188
);
74189
}
75190

76-
// Extract basic info
77-
const title = article.title || '';
78-
const abstract = article.abstract || '';
79-
const abstractText = typeof abstract === 'string' ? abstract : '';
191+
const article = parseEFetchXml(xml, pmid);
80192

81-
// Extract authors
82-
const authorList = article.authors || [];
83-
const allAuthors = extractAuthors(authorList, 10);
84-
const firstAuthor = extractFirstAuthor(authorList);
85-
const correspondingAuthor = extractCorrespondingAuthor(authorList);
86-
87-
// Extract journal info
88-
const journalTitle = article.fulljournalname || article.source || '';
89-
const isoAbbreviation = article.source || '';
90-
91-
// Extract publication date
92-
const pubDate = article.pubdate || '';
93-
const year = pubDate.split(' ')[0] || '';
94-
const fullDate = pubDate;
95-
96-
// Extract volume, issue, pages
97-
const volume = article.volume || '';
98-
const issue = article.issue || '';
99-
const pagination = article.pages || '';
100-
101-
// Extract article IDs
102-
const articleIds = article.articleids || [];
103-
const doi = extractDoi(articleIds);
104-
const pmcId = extractPmcId(articleIds);
105-
106-
// Extract MeSH terms and keywords (from ESummary these may not be available)
107-
const meshTerms: string[] = [];
108-
const keywords: string[] = [];
109-
110-
// Extract article type
111-
const pubTypeList = article.pubtype || [];
112-
const articleType = formatArticleType(pubTypeList);
113-
114-
// Extract language
115-
const language = article.lang?.[0] || '';
116-
117-
// If JSON format requested, return full structured data
118193
if (args.output === 'json') {
119194
return [{
120195
field: 'data',
121-
value: JSON.stringify({
122-
pmid,
123-
title,
124-
abstract: abstractText,
125-
authors: {
126-
all: allAuthors,
127-
first: firstAuthor,
128-
corresponding: correspondingAuthor,
129-
count: authorList?.length || 0,
130-
},
131-
journal: {
132-
title: journalTitle,
133-
isoAbbreviation,
134-
volume,
135-
issue,
136-
pagination,
137-
},
138-
publication: {
139-
year,
140-
fullDate,
141-
},
142-
ids: {
143-
pmid,
144-
doi,
145-
pmc: pmcId,
146-
},
147-
classification: {
148-
articleType,
149-
language,
150-
meshTerms,
151-
keywords,
152-
},
153-
url: buildPubMedUrl(pmid),
154-
}, null, 2),
196+
value: JSON.stringify(article, null, 2),
155197
}];
156198
}
157199

158-
// Table format - return key-value pairs
200+
// Table format
159201
const rows: Array<{ field: string; value: string }> = [
160-
{ field: 'PMID', value: pmid },
161-
{ field: 'Title', value: title },
162-
{ field: 'First Author', value: firstAuthor },
163-
{ field: 'Corresponding Author', value: correspondingAuthor },
164-
{ field: 'All Authors', value: truncateText(allAuthors, 100) },
165-
{ field: 'Journal', value: journalTitle },
166-
{ field: 'Year', value: year },
167-
{ field: 'Volume/Issue', value: `${volume}${issue ? `(${issue})` : ''}` },
168-
{ field: 'Pages', value: pagination },
169-
{ field: 'DOI', value: doi || 'N/A' },
170-
{ field: 'PMC ID', value: pmcId || 'N/A' },
171-
{ field: 'Article Type', value: articleType },
172-
{ field: 'Language', value: language },
173-
{ field: 'MeSH Terms', value: meshTerms.join(', ') || 'N/A' },
174-
{ field: 'Keywords', value: keywords.join(', ') || 'N/A' },
175-
{ field: 'Abstract', value: truncateText(abstractText, 300) || 'N/A' },
176-
{ field: 'URL', value: buildPubMedUrl(pmid) },
202+
{ field: 'PMID', value: article.pmid },
203+
{ field: 'Title', value: article.title },
204+
{ field: 'First Author', value: article.authors.first },
205+
{ field: 'Corresponding Author', value: article.authors.corresponding },
206+
{ field: 'All Authors', value: truncateText(article.authors.all, 120) },
207+
{ field: 'Affiliations', value: truncateText(article.authors.affiliations[0] || 'N/A', 120) },
208+
{ field: 'Journal', value: article.journal.title || article.journal.isoAbbreviation },
209+
{ field: 'Year', value: article.publication.year },
210+
{ field: 'Volume/Issue', value: `${article.journal.volume}${article.journal.issue ? `(${article.journal.issue})` : ''}` },
211+
{ field: 'Pages', value: article.journal.pagination },
212+
{ field: 'DOI', value: article.ids.doi || 'N/A' },
213+
{ field: 'PMC ID', value: article.ids.pmc || 'N/A' },
214+
{ field: 'Article Type', value: article.classification.articleType },
215+
{ field: 'Language', value: article.classification.language },
216+
{ field: 'MeSH Terms', value: article.classification.meshTerms.join(', ') || 'N/A' },
217+
{ field: 'Keywords', value: article.classification.keywords.join(', ') || 'N/A' },
218+
{ field: 'Abstract', value: truncateText(article.abstract, 400) || 'N/A' },
219+
{ field: 'URL', value: article.url },
177220
];
178221

179-
180-
181222
return rows;
182223
},
183224
});

0 commit comments

Comments
 (0)