Skip to content

Commit 8d38836

Browse files
committed
revert to original approach
1 parent 3b62fed commit 8d38836

1 file changed

Lines changed: 5 additions & 11 deletions

File tree

server/workers/orcid/src/orcid_service.py

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -421,7 +421,7 @@ def enrich_metadata_with_base(self, params: Dict[str, str], metadata: pd.DataFra
421421

422422
# run only if the log level is DEBUG; otherwise it is too expensive and we don't want it in production
423423
if self.logger.isEnabledFor(logging.DEBUG):
424-
self._log_dataframe(metadata, params, '_original')
424+
self._log_dataframe(metadata.sort_values(by='title'), params, '_original')
425425

426426
raw_dois = metadata["doi"].tolist()
427427
dois = [doi for doi in raw_dois if doi and pd.notna(doi)]
@@ -430,8 +430,7 @@ def enrich_metadata_with_base(self, params: Dict[str, str], metadata: pd.DataFra
430430
base_metadata = self.request_base_metadata(dois_for_base_query, params)
431431

432432
if self.logger.isEnabledFor(logging.DEBUG):
433-
self._log_dataframe(base_metadata, params, 'base_metadata_raw')
434-
433+
self._log_dataframe(base_metadata.sort_values(by='title'), params, 'base_metadata_raw')
435434
# dataframe
436435
# paper, doi= "10.17169/refubium-48053; 10.1371/journal.pone.0311918"
437436
# 1. step: split on "; " -> ["10.17169/refubium-48053", "10.1371/journal.pone.0311918"]
@@ -459,14 +458,9 @@ def enrich_metadata_with_base(self, params: Dict[str, str], metadata: pd.DataFra
459458
base_metadata = self._match_dois_by_version(base_metadata, dois)
460459

461460
base_metadata = base_metadata[base_metadata['doi'].isin(dois)]
462-
# Sort by oa_state priority (1=open > 0=restricted > 2=unknown) so the
463-
# most open record is kept when deduplicating by DOI.
464-
oa_state_order = {1: 0, 0: 1, 2: 2}
465-
base_metadata = base_metadata.assign(
466-
_oa_sort=base_metadata['oa_state'].map(oa_state_order)
467-
).sort_values(by='_oa_sort').drop_duplicates(subset='doi', keep='first').drop(columns='_oa_sort')
461+
base_metadata = base_metadata.drop_duplicates(subset='doi', keep='first')
468462
if self.logger.isEnabledFor(logging.DEBUG):
469-
self._log_dataframe(base_metadata, params, 'base_metadata')
463+
self._log_dataframe(base_metadata.sort_values(by='title'), params, 'base_metadata')
470464

471465
# Select and rename relevant fields from base_metadata, including subject_orig
472466
fields_to_merge = {
@@ -519,7 +513,7 @@ def custom_merge(existing_value, new_value):
519513
enriched_metadata.drop(columns=['paper_abstract_base', 'subject_orig_base', 'subject_base', 'oa_state_base', 'link_base', 'relation_base'], inplace=True)
520514

521515
if self.logger.isEnabledFor(logging.DEBUG):
522-
self._log_dataframe(enriched_metadata, params, '_enriched')
516+
self._log_dataframe(enriched_metadata.sort_values(by='title'), params, '_enriched')
523517

524518
# temporary solution: for some reason, if we have some undefined data, data processing fails
525519
enriched_metadata = enriched_metadata.reindex(columns=list(set(original_columns + ['oa_state', 'subject', 'subject_orig', 'paper_abstract', 'link', 'relation'])))

0 commit comments

Comments
 (0)