@@ -628,8 +628,8 @@ def recover_any_missing_chain_ids(interim_dataset_dir: str, new_pdb_filepath: st
628
628
orig_pdb_chain_id = '_' # Default value for missing chain IDs
629
629
new_pdb_code = db .get_pdb_code (new_pdb_filepath )
630
630
orig_pdb_name = db .get_pdb_name (orig_pdb_filepath )
631
- orig_pdb_df = PandasPdb ().read_pdb (new_pdb_filepath ). df [ 'ATOM' ]
632
- unique_chain_ids = np .unique (orig_pdb_df ['chain_id' ].values )
631
+ new_pdb_obj = PandasPdb ().read_pdb (new_pdb_filepath )
632
+ unique_chain_ids = np .unique (new_pdb_obj . df [ 'ATOM' ] ['chain_id' ].values )
633
633
634
634
"""Ascertain the chain ID corresponding to the original PDB file, using one of two available methods.
635
635
Method 1: Used with datasets such as EVCoupling adopting .atom filename extensions (e.g., 4DI3C.atom)
@@ -645,6 +645,13 @@ def recover_any_missing_chain_ids(interim_dataset_dir: str, new_pdb_filepath: st
645
645
# Assume the first/second index is the first non-empty chain ID (e.g., 'A')
646
646
orig_pdb_chain_id = unique_chain_ids [0 ] if (unique_chain_ids [0 ] != '' ) else unique_chain_ids [1 ]
647
647
648
+ # Overwrite the copy of the input PDB file in input_dataset_dir so that every record carries the recovered chain ID
649
+ new_pdb_obj .df ['ATOM' ]['chain_id' ] = orig_pdb_chain_id
650
+ new_pdb_obj .df ['HETATM' ]['chain_id' ] = orig_pdb_chain_id
651
+ new_pdb_obj .df ['ANISOU' ]['chain_id' ] = orig_pdb_chain_id
652
+ new_pdb_obj .df ['OTHERS' ]['chain_id' ] = orig_pdb_chain_id
653
+ new_pdb_obj .to_pdb (new_pdb_filepath , records = None , gz = False , append_newline = True )
654
+
648
655
# Update existing parsed chains to contain the newly-recovered chain ID
649
656
parsed_dir = os .path .join (interim_dataset_dir , 'parsed' , pdb_code )
650
657
parsed_filenames = [
@@ -818,7 +825,7 @@ def convert_input_pdb_files_to_pair(left_pdb_filepath: str, right_pdb_filepath:
818
825
output_dir = os .path .join (input_dataset_dir , 'final' , 'raw' )
819
826
produced_filenames = db .get_structures_filenames (output_dir , extension = '.dill' )
820
827
produced_keys = [db .get_pdb_name (x ) for x in produced_filenames
821
- if db .get_pdb_code (x ).upper () in db .get_pdb_code (left_pdb_filepath ).upper ()]
828
+ if db .get_pdb_code (x ).upper () in db .get_pdb_code (new_l_u_filepath ).upper ()]
822
829
pair_filepath = [os .path .join (output_dir , db .get_pdb_code (key )[1 :3 ], key )
823
830
for key in produced_keys ][0 ]
824
831
# Impute any missing feature values in the postprocessed input pairs
0 commit comments