@@ -345,50 +345,6 @@ def get_unversioned_doi(doi_str):
345345
346346 return base_metadata
347347
def _explode_merged_dois(self, base_metadata: pd.DataFrame) -> pd.DataFrame:
    """
    Explode the 'merged_dois' field into one row per DOI variant.

    If base_metadata contains a 'merged_dois' column holding several DOIs
    separated by '; ', every DOI gets its own row (all other columns are
    repeated), so a record can later be matched by any of its DOIs. Each
    DOI is passed through remove_doi_prefix.

    The input frame is never modified: the work happens on a copy.

    Parameters:
    - base_metadata: DataFrame with a 'doi' column and, optionally, a
      'merged_dois' column. A 'merged_dois' cell may be a '; '-separated
      string, a list/tuple of such strings, or empty/NaN.

    Returns:
    - A new DataFrame in which each row carries a single DOI in the 'doi'
      column. Rows without usable merged DOIs keep their own 'doi' value
      (after remove_doi_prefix). The index is reset when rows are exploded.
    """
    # Copy first: otherwise the temporary helper column (and, in the branch
    # without 'merged_dois', the rewritten 'doi' column) would leak into the
    # caller's DataFrame.
    base_metadata = base_metadata.copy()

    if 'merged_dois' not in base_metadata.columns:
        # No merged_dois column: only normalise the regular doi column.
        base_metadata.loc[:, 'doi'] = base_metadata['doi'].apply(remove_doi_prefix)
        return base_metadata

    def split_variants(cell):
        """Return the cleaned DOI variants found in one 'merged_dois' cell."""
        # List-valued cells must be flattened explicitly: pd.notna() on a
        # list returns an array (ambiguous truth value) and str() on a list
        # would produce a bogus DOI such as "['10.1/x']".
        if isinstance(cell, (list, tuple)):
            parts = cell
        elif pd.notna(cell):
            parts = [cell]
        else:
            return []

        variants = []
        for part in parts:
            if not isinstance(part, str):
                if pd.api.types.is_scalar(part) and pd.isna(part):
                    continue
                part = str(part)
            # Split on the exact '; ' separator only; a bare ';' is left
            # alone because it can be part of a DOI itself.
            for doi in part.split('; '):
                doi = doi.strip()
                if doi:
                    variants.append(remove_doi_prefix(doi))
        return variants

    base_metadata['merged_dois_list'] = base_metadata['merged_dois'].apply(split_variants)

    # One row per DOI variant; rows with an empty list stay as a single row
    # whose 'merged_dois_list' value becomes NaN.
    base_metadata = base_metadata.explode('merged_dois_list', ignore_index=True)

    # Rows that received a variant take it as their DOI ...
    has_variant = base_metadata['merged_dois_list'].notna() & (base_metadata['merged_dois_list'] != '')
    base_metadata.loc[has_variant, 'doi'] = base_metadata.loc[has_variant, 'merged_dois_list']

    # ... all other rows fall back to their own (normalised) doi value.
    base_metadata.loc[~has_variant, 'doi'] = base_metadata.loc[~has_variant, 'doi'].apply(remove_doi_prefix)

    # Drop the temporary helper column before returning.
    return base_metadata.drop(columns=['merged_dois_list'])

391-
392348 def enrich_metadata_with_base (self , params : Dict [str , str ], metadata : pd .DataFrame ) -> pd .DataFrame :
393349 self .logger .debug (f"Enriching metadata with base for ORCID { params .get ('orcid' )} " )
394350
@@ -437,7 +393,6 @@ def enrich_metadata_with_base(self, params: Dict[str, str], metadata: pd.DataFra
437393
438394 base_metadata = base_metadata .reindex (columns = required_fields )
439395
440- #base_metadata = self._explode_merged_dois(base_metadata)
441396 base_metadata ['merged_dois' ] = base_metadata ['merged_dois' ].apply (lambda x : x [0 ] if isinstance (x , list ) and len (x ) > 0 else x )
442397 base_metadata ['merged_dois' ] = base_metadata ['merged_dois' ].apply (lambda x : x .split (';' ) if isinstance (x , str ) else [])
443398 base_metadata ['merged_dois' ] = base_metadata ['merged_dois' ].apply (lambda x : [x .strip () for x in x ] if isinstance (x , list ) else x )
0 commit comments