@@ -421,7 +421,7 @@ def enrich_metadata_with_base(self, params: Dict[str, str], metadata: pd.DataFra
421421
422422 # Run only when the log level is DEBUG; otherwise it is too expensive, and we don't want it in production.
423423 if self .logger .isEnabledFor (logging .DEBUG ):
424- self ._log_dataframe (metadata , params , '_original' )
424+ self ._log_dataframe (metadata . sort_values ( by = 'title' ) , params , '_original' )
425425
426426 raw_dois = metadata ["doi" ].tolist ()
427427 dois = [doi for doi in raw_dois if doi and pd .notna (doi )]
@@ -430,8 +430,7 @@ def enrich_metadata_with_base(self, params: Dict[str, str], metadata: pd.DataFra
430430 base_metadata = self .request_base_metadata (dois_for_base_query , params )
431431
432432 if self .logger .isEnabledFor (logging .DEBUG ):
433- self ._log_dataframe (base_metadata , params , 'base_metadata_raw' )
434-
433+ self ._log_dataframe (base_metadata .sort_values (by = 'title' ), params , 'base_metadata_raw' )
435434 # dataframe
436435 # paper, doi= "10.17169/refubium-48053; 10.1371/journal.pone.0311918"
437436 # 1. step: split on "; " -> ["10.17169/refubium-48053", "10.1371/journal.pone.0311918"]
@@ -459,14 +458,9 @@ def enrich_metadata_with_base(self, params: Dict[str, str], metadata: pd.DataFra
459458 base_metadata = self ._match_dois_by_version (base_metadata , dois )
460459
461460 base_metadata = base_metadata [base_metadata ['doi' ].isin (dois )]
462- # Sort by oa_state priority (1=open > 0=restricted > 2=unknown) so the
463- # most open record is kept when deduplicating by DOI.
464- oa_state_order = {1 : 0 , 0 : 1 , 2 : 2 }
465- base_metadata = base_metadata .assign (
466- _oa_sort = base_metadata ['oa_state' ].map (oa_state_order )
467- ).sort_values (by = '_oa_sort' ).drop_duplicates (subset = 'doi' , keep = 'first' ).drop (columns = '_oa_sort' )
461+ base_metadata = base_metadata .drop_duplicates (subset = 'doi' , keep = 'first' )
468462 if self .logger .isEnabledFor (logging .DEBUG ):
469- self ._log_dataframe (base_metadata , params , 'base_metadata' )
463+ self ._log_dataframe (base_metadata . sort_values ( by = 'title' ) , params , 'base_metadata' )
470464
471465 # Select and rename relevant fields from base_metadata, including subject_orig
472466 fields_to_merge = {
@@ -519,7 +513,7 @@ def custom_merge(existing_value, new_value):
519513 enriched_metadata .drop (columns = ['paper_abstract_base' , 'subject_orig_base' , 'subject_base' , 'oa_state_base' , 'link_base' , 'relation_base' ], inplace = True )
520514
521515 if self .logger .isEnabledFor (logging .DEBUG ):
522- self ._log_dataframe (enriched_metadata , params , '_enriched' )
516+ self ._log_dataframe (enriched_metadata . sort_values ( by = 'title' ) , params , '_enriched' )
523517
524518 # Temporary workaround: for some reason, data processing fails if some of the data is undefined.
525519 enriched_metadata = enriched_metadata .reindex (columns = list (set (original_columns + ['oa_state' , 'subject' , 'subject_orig' , 'paper_abstract' , 'link' , 'relation' ])))
0 commit comments