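"""Compute the average number of interfacing water molecules per DIPS-Plus complex.

For each postprocessed training and validation pair, this script loads the two
bound PDB structures and, for every interface residue, counts the water
molecules (HOH/WAT) found by a BioPython NeighborSearch within a configurable
distance cutoff of the residue's alpha-carbon.
"""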
import click
import logging
import os
import warnings

import atom3.pair as pa
import numpy as np
import pandas as pd

from Bio import BiopythonWarning
from Bio.PDB import NeighborSearch
from Bio.PDB import PDBParser
from pathlib import Path
from tqdm import tqdm

from project.utils.utils import download_pdb_file, gunzip_file


@click.command()
@click.argument('output_dir', default='../DIPS/final/raw', type=click.Path())
@click.option('--source_type', default='rcsb', type=click.Choice(['rcsb', 'db5']))
@click.option('--interfacing_water_distance_cutoff', default=10.0, type=float)
def main(output_dir: str, source_type: str, interfacing_water_distance_cutoff: float):
    logger = logging.getLogger(__name__)
    logger.info("Analyzing interface waters within each dataset example...")

    if source_type.lower() == "rcsb":
        parser = PDBParser()

        # Filter and suppress BioPython warnings
        warnings.filterwarnings("ignore", category=BiopythonWarning)

        # Collect (and, if necessary, extract) all training PDB files
        train_num_complexes = 0
        train_complex_num_waters = 0
        pairs_postprocessed_train_txt = os.path.join(output_dir, 'pairs-postprocessed-train-before-structure-based-filtering.txt')
        assert os.path.exists(pairs_postprocessed_train_txt), "DIPS-Plus train filenames must be curated in advance."
        with open(pairs_postprocessed_train_txt, "r") as f:
            train_filenames = [line.strip() for line in f.readlines()]
        for train_filename in tqdm(train_filenames):
            try:
                postprocessed_train_pair: pa.Pair = pd.read_pickle(os.path.join(output_dir, train_filename))
            except Exception as e:
                logger.error(f"Could not open postprocessed training pair {os.path.join(output_dir, train_filename)} due to: {e}")
                continue
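            # The middle two characters of the PDB ID select the shard subdirectory under raw/pdb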
            pdb_code = postprocessed_train_pair.df0.pdb_name[0].split("_")[0][1:3]
            pdb_dir = os.path.join(Path(output_dir).parent.parent, "raw", "pdb", pdb_code)
            l_b_pdb_filepath = os.path.join(pdb_dir, postprocessed_train_pair.df0.pdb_name[0])
            r_b_pdb_filepath = os.path.join(pdb_dir, postprocessed_train_pair.df1.pdb_name[0])
            l_b_df0_chains = postprocessed_train_pair.df0.chain.unique()
            r_b_df1_chains = postprocessed_train_pair.df1.chain.unique()
            assert (
                len(postprocessed_train_pair.df0.pdb_name.unique()) == len(l_b_df0_chains) == 1
            ), "Only a single PDB filename and chain identifier can be associated with a single training example."
            assert (
                len(postprocessed_train_pair.df1.pdb_name.unique()) == len(r_b_df1_chains) == 1
            ), "Only a single PDB filename and chain identifier can be associated with a single training example."
            # Prefer a local (possibly gzipped) copy of each bound PDB file; otherwise, download it anew
            if not os.path.exists(l_b_pdb_filepath) and os.path.exists(l_b_pdb_filepath + ".gz"):
                gunzip_file(l_b_pdb_filepath)
            if not os.path.exists(r_b_pdb_filepath) and os.path.exists(r_b_pdb_filepath + ".gz"):
                gunzip_file(r_b_pdb_filepath)
            if not os.path.exists(l_b_pdb_filepath):
                download_pdb_file(os.path.basename(l_b_pdb_filepath), l_b_pdb_filepath)
            if not os.path.exists(r_b_pdb_filepath):
                download_pdb_file(os.path.basename(r_b_pdb_filepath), r_b_pdb_filepath)
            assert os.path.exists(l_b_pdb_filepath) and os.path.exists(r_b_pdb_filepath), "Both the left-bound and right-bound PDB files must exist."

            l_b_structure = parser.get_structure('protein', l_b_pdb_filepath)
            r_b_structure = parser.get_structure('protein', r_b_pdb_filepath)

            # Select each partner's interface residues, as indexed by the pair's pos_idx
            l_b_interface_residues = postprocessed_train_pair.df0[postprocessed_train_pair.df0.index.isin(postprocessed_train_pair.pos_idx[:, 0])]
            r_b_interface_residues = postprocessed_train_pair.df1[postprocessed_train_pair.df1.index.isin(postprocessed_train_pair.pos_idx[:, 1])]

            train_num_complexes += 1

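            # Count the waters within the distance cutoff of each left-bound interface residue's alpha-carbon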
            l_b_ns = NeighborSearch(list(l_b_structure.get_atoms()))
            for index, row in l_b_interface_residues.iterrows():
                chain_id = row['chain']
                residue = row['residue'].strip()
                model = l_b_structure[0]
                chain = model[chain_id]
                if residue.lstrip("-").isdigit():
                    residue = int(residue)
                else:
                    # Residue IDs such as "102A" end with a PDB insertion code; since the
                    # ID was stripped above, the insertion code can never be whitespace
                    residue_index, residue_icode = residue[:-1], residue[-1:]
                    residue = (" ", int(residue_index), residue_icode)
                target_residue = chain[residue]
                target_coords = np.array([atom.get_coord() for atom in target_residue.get_atoms() if atom.get_name() == 'CA']).squeeze()
                interfacing_atoms = l_b_ns.search(target_coords, interfacing_water_distance_cutoff, 'A')
                waters_within_threshold = [atom for atom in interfacing_atoms if atom.get_parent().get_resname() in ['HOH', 'WAT']]
                train_complex_num_waters += len(waters_within_threshold)

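            # Repeat the count for waters near the right-bound partner's interface residues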
            r_b_ns = NeighborSearch(list(r_b_structure.get_atoms()))
            for index, row in r_b_interface_residues.iterrows():
                chain_id = row['chain']
                residue = row['residue'].strip()
                model = r_b_structure[0]
                chain = model[chain_id]
                if residue.lstrip("-").isdigit():
                    residue = int(residue)
                else:
                    residue_index, residue_icode = residue[:-1], residue[-1:]
                    residue = (" ", int(residue_index), residue_icode)
                target_residue = chain[residue]
                target_coords = np.array([atom.get_coord() for atom in target_residue.get_atoms() if atom.get_name() == 'CA']).squeeze()
                interfacing_atoms = r_b_ns.search(target_coords, interfacing_water_distance_cutoff, 'A')
                waters_within_threshold = [atom for atom in interfacing_atoms if atom.get_parent().get_resname() in ['HOH', 'WAT']]
                train_complex_num_waters += len(waters_within_threshold)

        # Collect (and, if necessary, extract) all validation PDB files
        val_num_complexes = 0
        val_complex_num_waters = 0
        pairs_postprocessed_val_txt = os.path.join(output_dir, 'pairs-postprocessed-val-before-structure-based-filtering.txt')
        assert os.path.exists(pairs_postprocessed_val_txt), "DIPS-Plus validation filenames must be curated in advance."
        with open(pairs_postprocessed_val_txt, "r") as f:
            val_filenames = [line.strip() for line in f.readlines()]
        for val_filename in tqdm(val_filenames):
            try:
                postprocessed_val_pair: pa.Pair = pd.read_pickle(os.path.join(output_dir, val_filename))
            except Exception as e:
                logger.error(f"Could not open postprocessed validation pair {os.path.join(output_dir, val_filename)} due to: {e}")
                continue
            pdb_code = postprocessed_val_pair.df0.pdb_name[0].split("_")[0][1:3]
            pdb_dir = os.path.join(Path(output_dir).parent.parent, "raw", "pdb", pdb_code)
            l_b_pdb_filepath = os.path.join(pdb_dir, postprocessed_val_pair.df0.pdb_name[0])
            r_b_pdb_filepath = os.path.join(pdb_dir, postprocessed_val_pair.df1.pdb_name[0])
            l_b_df0_chains = postprocessed_val_pair.df0.chain.unique()
            r_b_df1_chains = postprocessed_val_pair.df1.chain.unique()
            assert (
                len(postprocessed_val_pair.df0.pdb_name.unique()) == len(l_b_df0_chains) == 1
            ), "Only a single PDB filename and chain identifier can be associated with a single validation example."
            assert (
                len(postprocessed_val_pair.df1.pdb_name.unique()) == len(r_b_df1_chains) == 1
            ), "Only a single PDB filename and chain identifier can be associated with a single validation example."
            if not os.path.exists(l_b_pdb_filepath) and os.path.exists(l_b_pdb_filepath + ".gz"):
                gunzip_file(l_b_pdb_filepath)
            if not os.path.exists(r_b_pdb_filepath) and os.path.exists(r_b_pdb_filepath + ".gz"):
                gunzip_file(r_b_pdb_filepath)
            if not os.path.exists(l_b_pdb_filepath):
                download_pdb_file(os.path.basename(l_b_pdb_filepath), l_b_pdb_filepath)
            if not os.path.exists(r_b_pdb_filepath):
                download_pdb_file(os.path.basename(r_b_pdb_filepath), r_b_pdb_filepath)
            assert os.path.exists(l_b_pdb_filepath) and os.path.exists(r_b_pdb_filepath), "Both the left-bound and right-bound PDB files must exist."

            l_b_structure = parser.get_structure('protein', l_b_pdb_filepath)
            r_b_structure = parser.get_structure('protein', r_b_pdb_filepath)

            l_b_interface_residues = postprocessed_val_pair.df0[postprocessed_val_pair.df0.index.isin(postprocessed_val_pair.pos_idx[:, 0])]
            r_b_interface_residues = postprocessed_val_pair.df1[postprocessed_val_pair.df1.index.isin(postprocessed_val_pair.pos_idx[:, 1])]

            val_num_complexes += 1

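            # As above, count the waters within the cutoff of each left-bound interface residue's alpha-carbon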
            l_b_ns = NeighborSearch(list(l_b_structure.get_atoms()))
            for index, row in l_b_interface_residues.iterrows():
                chain_id = row['chain']
                residue = row['residue'].strip()
                model = l_b_structure[0]
                chain = model[chain_id]
                if residue.lstrip("-").isdigit():
                    residue = int(residue)
                else:
                    residue_index, residue_icode = residue[:-1], residue[-1:]
                    residue = (" ", int(residue_index), residue_icode)
                target_residue = chain[residue]
                target_coords = np.array([atom.get_coord() for atom in target_residue.get_atoms() if atom.get_name() == 'CA']).squeeze()
                interfacing_atoms = l_b_ns.search(target_coords, interfacing_water_distance_cutoff, 'A')
                waters_within_threshold = [atom for atom in interfacing_atoms if atom.get_parent().get_resname() in ['HOH', 'WAT']]
                val_complex_num_waters += len(waters_within_threshold)

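            # Repeat the count for waters near the right-bound partner's interface residues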
            r_b_ns = NeighborSearch(list(r_b_structure.get_atoms()))
            for index, row in r_b_interface_residues.iterrows():
                chain_id = row['chain']
                residue = row['residue'].strip()
                model = r_b_structure[0]
                chain = model[chain_id]
                if residue.lstrip("-").isdigit():
                    residue = int(residue)
                else:
                    residue_index, residue_icode = residue[:-1], residue[-1:]
                    residue = (" ", int(residue_index), residue_icode)
                target_residue = chain[residue]
                target_coords = np.array([atom.get_coord() for atom in target_residue.get_atoms() if atom.get_name() == 'CA']).squeeze()
                interfacing_atoms = r_b_ns.search(target_coords, interfacing_water_distance_cutoff, 'A')
                waters_within_threshold = [atom for atom in interfacing_atoms if atom.get_parent().get_resname() in ['HOH', 'WAT']]
                val_complex_num_waters += len(waters_within_threshold)

        # Train complexes
        train_num_waters_per_complex = train_complex_num_waters / train_num_complexes
        logger.info(f"Average number of interface waters per training complex: {train_num_waters_per_complex}")

        # Validation complexes
        val_num_waters_per_complex = val_complex_num_waters / val_num_complexes
        logger.info(f"Average number of interface waters per validation complex: {val_num_waters_per_complex}")

        # Train + Validation complexes
        train_val_num_waters_per_complex = (train_complex_num_waters + val_complex_num_waters) / (train_num_complexes + val_num_complexes)
        logger.info(f"Average number of interface waters per complex across training and validation: {train_val_num_waters_per_complex}")

        logger.info("Finished analyzing interface waters for all training and validation complexes")

    else:
        raise NotImplementedError(f"Source type {source_type} is currently not supported.")


if __name__ == "__main__":
    log_fmt = '%(asctime)s %(levelname)s %(process)d: %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)

    main()
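
# Example invocation (the script filename here is assumed for illustration; the
# postprocessed pair files must already be curated under the given output directory):
#   python3 analyze_interface_waters.py ../DIPS/final/raw --source_type rcsb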