Commit e5572dd (parent fb83378)

Add feature insertion script

9 files changed: +429 −2304 lines

environment.yml

+42 −7
@@ -150,8 +150,6 @@ dependencies:
   - llvm-openmp=16.0.4=h4dfa4b3_0
   - lz4-c=1.9.4=h6a678d5_0
   - magma=2.6.2=hc72dce7_0
-  - matplotlib=3.3.2=0
-  - matplotlib-base=3.3.2=py38h5c7f4ab_1
   - matplotlib-inline=0.1.6=pyhd8ed1ab_0
   - mkl=2022.2.1=h84fe81f_16997
   - mpi=1.0=openmpi
@@ -164,8 +162,6 @@ dependencies:
   - ninja=1.11.1=h924138e_0
   - nsight-compute=2023.1.1.4=0
   - numexpr=2.8.4=py38hd2a5715_1
-  - numpy=1.24.3=py38hf838250_0
-  - numpy-base=1.24.3=py38h1e6e340_0
   - openh264=2.1.1=h780b84a_0
   - openjpeg=2.5.0=hfec8fc6_2
   - openmpi=4.1.5=h414af15_101
@@ -187,7 +183,6 @@ dependencies:
   - pycparser=2.21=pyhd3eb1b0_0
   - pygments=2.15.1=pyhd8ed1ab_0
   - pyopenssl=23.1.1=pyhd8ed1ab_0
-  - pyparsing=3.1.0=pyhd8ed1ab_0
   - pysocks=1.7.1=py38h06a4308_0
   - python=3.8.16=he550d4f_1_cpython
   - python-dateutil=2.8.2=pyhd3eb1b0_0
@@ -235,63 +230,103 @@ dependencies:
     - git+https://github.com/amorehead/atom3.git@83987404ceed38a1f5a5abd517aa38128d0a4f2c
     - attrs==23.1.0
     - babel==2.12.1
+    - beautifulsoup4==4.12.2
     - biopandas==0.5.0.dev0
+    - bioservices==1.11.2
     - cachetools==5.3.1
+    - cattrs==23.1.2
     - click==7.0
+    - colorlog==6.7.0
     - configparser==5.3.0
+    - contourpy==1.1.0
+    - deepdiff==6.3.1
     - dill==0.3.3
     - docker-pycreds==0.4.0
     - docutils==0.17.1
     - easy-parallel-py3==0.1.6.4
+    - easydev==0.12.1
+    - exceptiongroup==1.1.2
     - fairscale==0.4.0
+    - fonttools==4.40.0
     - frozenlist==1.3.3
     - fsspec==2023.5.0
     - future==0.18.3
+    - gevent==22.10.2
     - gitdb==4.0.10
     - gitpython==3.1.31
     - google-auth==2.19.0
     - google-auth-oauthlib==1.0.0
+    - git+https://github.com/a-r-j/graphein.git@371ce9a462b610529488e87a712484328a89de36
+    - greenlet==2.0.2
+    - grequests==0.7.0
     - grpcio==1.54.2
     - h5py==3.8.0
     - hickle==5.0.2
     - imagesize==1.4.1
+    - importlib-resources==6.0.0
     - install==1.3.5
+    - jaxtyping==0.2.19
+    - jinja2==2.11.3
     - loguru==0.7.0
     - looseversion==1.1.2
+    - lxml==4.9.3
     - markdown==3.4.3
-    - markupsafe==2.1.3
+    - markdown-it-py==3.0.0
+    - markupsafe==1.1.1
+    - matplotlib==3.7.2
+    - mdurl==0.1.2
     - mmtf-python==1.1.3
     - mpi4py==3.0.3
     - msgpack==1.0.5
     - multidict==6.0.4
+    - multipledispatch==1.0.0
     - multiprocess==0.70.11.1
+    - numpy==1.23.5
     - oauthlib==3.2.2
+    - ordered-set==4.1.0
     - pathos==0.2.7
     - pathtools==0.1.2
     - pdb-tools==2.5.0
+    - platformdirs==3.8.1
+    - plotly==5.15.0
     - pox==0.3.2
     - ppft==1.7.6.6
     - promise==2.3
     - protobuf==3.20.3
     - pyasn1==0.5.0
     - pyasn1-modules==0.3.0
+    - pydantic==1.10.11
     - pydeprecate==0.3.1
+    - pyparsing==3.0.9
     - pytorch-lightning==1.4.8
-    - pyyaml==6.0
+    - pyyaml==5.4.1
+    - requests-cache==1.1.0
     - requests-oauthlib==1.3.1
+    - rich==13.4.2
+    - rich-click==1.6.1
     - rsa==4.9
     - seaborn==0.12.2
     - sentry-sdk==1.24.0
     - shortuuid==1.0.11
     - smmap==5.0.0
     - snowballstemmer==2.2.0
+    - soupsieve==2.4.1
     - subprocess32==3.5.4
+    - suds-community==1.1.2
+    - tenacity==8.2.2
     - tensorboard==2.13.0
     - tensorboard-data-server==0.7.0
     - termcolor==2.3.0
     - torchmetrics==0.5.1
+    - typeguard==4.0.0
+    - url-normalize==1.4.3
     - wandb==0.12.2
     - werkzeug==2.3.6
     - wget==3.2
+    - wrapt==1.15.0
+    - xarray==2023.1.0
+    - xmltodict==0.13.0
     - yarl==1.9.2
     - yaspin==2.3.0
+    - zope-event==5.0
+    - zope-interface==6.0
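
Net effect of the changes above: numpy drops from the conda pin 1.24.3 to the pip pin 1.23.5, matplotlib from conda 3.3.2 to pip 3.7.2, pyparsing from conda 3.1.0 to pip 3.0.9, and pyyaml from 6.0 to 5.4.1, alongside a number of newly added pip packages (graphein, bioservices, rich, and others). A minimal sanity check for a recreated environment follows; the expected versions are copied from the diff above, and the script is a sketch rather than part of the repository:

# Hedged sketch (not part of this commit): verify a few of the pins changed above.
# Expected versions are taken from the environment.yml diff in this commit.
import importlib.metadata as importlib_metadata

EXPECTED = {"numpy": "1.23.5", "matplotlib": "3.7.2", "pyparsing": "3.0.9", "PyYAML": "5.4.1"}

for distribution, expected_version in EXPECTED.items():
    installed_version = importlib_metadata.version(distribution)
    status = "OK" if installed_version == expected_version else f"MISMATCH (found {installed_version})"
    print(f"{distribution}: expected {expected_version} -> {status}")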

project/datasets/analysis/analyze_experiment_types_and_resolution.py

+3 −3
@@ -8,10 +8,10 @@
 import numpy as np
 import pandas as pd
 
+from graphein.ml.datasets import PDBManager
 from pathlib import Path
 from tqdm import tqdm
 
-from project.datasets.analysis.pdb_data import PDBManager
 from project.utils.utils import download_pdb_file, gunzip_file
 
 
@@ -79,7 +79,7 @@ def main(output_dir: str, source_type: str):
     # Collect (and, if necessary, extract) all training PDB files
     train_pdb_codes = []
     pairs_postprocessed_train_txt = os.path.join(output_dir, 'pairs-postprocessed-train-before-structure-based-filtering.txt')
-    assert os.path.exists(pairs_postprocessed_train_txt), "DB5-Plus train filenames must be curated in advance to partition training and validation filenames."
+    assert os.path.exists(pairs_postprocessed_train_txt), "DIPS-Plus train filenames must be curated in advance."
     with open(pairs_postprocessed_train_txt, "r") as f:
         train_filenames = [line.strip() for line in f.readlines()]
     for train_filename in tqdm(train_filenames):
@@ -117,7 +117,7 @@ def main(output_dir: str, source_type: str):
     # Collect (and, if necessary, extract) all validation PDB files
     val_pdb_codes = []
     pairs_postprocessed_val_txt = os.path.join(output_dir, 'pairs-postprocessed-val-before-structure-based-filtering.txt')
-    assert os.path.exists(pairs_postprocessed_val_txt), "DB5-Plus validation filenames must be curated in advance to partition training and validation filenames."
+    assert os.path.exists(pairs_postprocessed_val_txt), "DIPS-Plus validation filenames must be curated in advance."
     with open(pairs_postprocessed_val_txt, "r") as f:
         val_filenames = [line.strip() for line in f.readlines()]
     for val_filename in tqdm(val_filenames):
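
The import swap above replaces the repository's vendored PDBManager with the one shipped by the graphein commit now pinned in environment.yml. For orientation only, a minimal usage sketch follows; it assumes (this is an assumption, not taken from this commit) that PDBManager accepts a root_dir keyword for its metadata cache and exposes the parsed RCSB metadata as a pandas DataFrame via a df attribute, and those names may differ between graphein versions:

# Hedged sketch, not code from this commit: minimal use of graphein's PDBManager
# after the import swap above.  The `root_dir` keyword and `df` attribute are
# assumptions about the pinned graphein version and may need adjusting.
from graphein.ml.datasets import PDBManager

pdb_manager = PDBManager(root_dir=".")  # caches RCSB metadata locally (assumed keyword)
print(pdb_manager.df.head())            # per-chain metadata table (assumed attribute)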

project/datasets/analysis/analyze_feature_correlation.py

+2 −2
@@ -30,7 +30,7 @@ def main(output_dir: str, source_type: str, feature_types_to_correlate: str):
     # Collect (and, if necessary, extract) all training PDB files
     train_feature_values = []
     pairs_postprocessed_train_txt = os.path.join(output_dir, 'pairs-postprocessed-train-before-structure-based-filtering.txt')
-    assert os.path.exists(pairs_postprocessed_train_txt), "DB5-Plus train filenames must be curated in advance to partition training and validation filenames."
+    assert os.path.exists(pairs_postprocessed_train_txt), "DIPS-Plus train filenames must be curated in advance."
     with open(pairs_postprocessed_train_txt, "r") as f:
         train_filenames = [line.strip() for line in f.readlines()]
     for train_filename in tqdm(train_filenames):
@@ -68,7 +68,7 @@ def main(output_dir: str, source_type: str, feature_types_to_correlate: str):
     # Collect (and, if necessary, extract) all validation PDB files
     val_feature_values = []
     pairs_postprocessed_val_txt = os.path.join(output_dir, 'pairs-postprocessed-val-before-structure-based-filtering.txt')
-    assert os.path.exists(pairs_postprocessed_val_txt), "DB5-Plus validation filenames must be curated in advance to partition training and validation filenames."
+    assert os.path.exists(pairs_postprocessed_val_txt), "DIPS-Plus validation filenames must be curated in advance."
     with open(pairs_postprocessed_val_txt, "r") as f:
         val_filenames = [line.strip() for line in f.readlines()]
     for val_filename in tqdm(val_filenames):

New file (+212 lines)

@@ -0,0 +1,212 @@
+import click
+import logging
+import os
+import warnings
+
+import atom3.pair as pa
+import numpy as np
+import pandas as pd
+
+from Bio import BiopythonWarning
+from Bio.PDB import NeighborSearch
+from Bio.PDB import PDBParser
+from pathlib import Path
+from tqdm import tqdm
+
+from project.utils.utils import download_pdb_file, gunzip_file
+
+
+@click.command()
+@click.argument('output_dir', default='../DIPS/final/raw', type=click.Path())
+@click.option('--source_type', default='rcsb', type=click.Choice(['rcsb', 'db5']))
+@click.option('--interfacing_water_distance_cutoff', default=10.0, type=float)
+def main(output_dir: str, source_type: str, interfacing_water_distance_cutoff: float):
+    logger = logging.getLogger(__name__)
+    logger.info("Analyzing interface waters within each dataset example...")
+
+    if source_type.lower() == "rcsb":
+        parser = PDBParser()
+
+        # Filter and suppress BioPython warnings
+        warnings.filterwarnings("ignore", category=BiopythonWarning)
+
+        # Collect (and, if necessary, extract) all training PDB files
+        train_num_complexes = 0
+        train_complex_num_waters = 0
+        pairs_postprocessed_train_txt = os.path.join(output_dir, 'pairs-postprocessed-train-before-structure-based-filtering.txt')
+        assert os.path.exists(pairs_postprocessed_train_txt), "DIPS-Plus train filenames must be curated in advance."
+        with open(pairs_postprocessed_train_txt, "r") as f:
+            train_filenames = [line.strip() for line in f.readlines()]
+        for train_filename in tqdm(train_filenames):
+            try:
+                postprocessed_train_pair: pa.Pair = pd.read_pickle(os.path.join(output_dir, train_filename))
+            except Exception as e:
+                logging.error(f"Could not open postprocessed training pair {os.path.join(output_dir, train_filename)} due to: {e}")
+                continue
+            pdb_code = postprocessed_train_pair.df0.pdb_name[0].split("_")[0][1:3]
+            pdb_dir = os.path.join(Path(output_dir).parent.parent, "raw", "pdb", pdb_code)
+            l_b_pdb_filepath = os.path.join(pdb_dir, postprocessed_train_pair.df0.pdb_name[0])
+            r_b_pdb_filepath = os.path.join(pdb_dir, postprocessed_train_pair.df1.pdb_name[0])
+            l_b_df0_chains = postprocessed_train_pair.df0.chain.unique()
+            r_b_df1_chains = postprocessed_train_pair.df1.chain.unique()
+            assert (
+                len(postprocessed_train_pair.df0.pdb_name.unique()) == len(l_b_df0_chains) == 1
+            ), "Only a single PDB filename and chain identifier can be associated with a single training example."
+            assert (
+                len(postprocessed_train_pair.df1.pdb_name.unique()) == len(r_b_df1_chains) == 1
+            ), "Only a single PDB filename and chain identifier can be associated with a single training example."
+            if not os.path.exists(l_b_pdb_filepath) and os.path.exists(l_b_pdb_filepath + ".gz"):
+                gunzip_file(l_b_pdb_filepath)
+            if not os.path.exists(r_b_pdb_filepath) and os.path.exists(r_b_pdb_filepath + ".gz"):
+                gunzip_file(r_b_pdb_filepath)
+            if not os.path.exists(l_b_pdb_filepath):
+                download_pdb_file(os.path.basename(l_b_pdb_filepath), l_b_pdb_filepath)
+            if not os.path.exists(r_b_pdb_filepath):
+                download_pdb_file(os.path.basename(r_b_pdb_filepath), r_b_pdb_filepath)
+            assert os.path.exists(l_b_pdb_filepath) and os.path.exists(r_b_pdb_filepath), "Both left and right-bound PDB files collected must exist."
+
+            l_b_structure = parser.get_structure('protein', l_b_pdb_filepath)
+            r_b_structure = parser.get_structure('protein', r_b_pdb_filepath)
+
+            l_b_interface_residues = postprocessed_train_pair.df0[postprocessed_train_pair.df0.index.isin(postprocessed_train_pair.pos_idx[:, 0])]
+            r_b_interface_residues = postprocessed_train_pair.df1[postprocessed_train_pair.df1.index.isin(postprocessed_train_pair.pos_idx[:, 1])]
+
+            train_num_complexes += 1
+
+            l_b_ns = NeighborSearch(list(l_b_structure.get_atoms()))
+            for index, row in l_b_interface_residues.iterrows():
+                chain_id = row['chain']
+                residue = row['residue'].strip()
+                model = l_b_structure[0]
+                chain = model[chain_id]
+                if residue.lstrip("-").isdigit():
+                    residue = int(residue)
+                else:
+                    residue_index, residue_icode = residue[:-1], residue[-1:]
+                    if residue_icode.strip() == "":
+                        residue = int(residue)
+                    else:
+                        residue = (" ", int(residue_index), residue_icode)
+                target_residue = chain[residue]
+                target_coords = np.array([atom.get_coord() for atom in target_residue.get_atoms() if atom.get_name() == 'CA']).squeeze()
+                interfacing_atoms = l_b_ns.search(target_coords, interfacing_water_distance_cutoff, 'A')
+                waters_within_threshold = [atom for atom in interfacing_atoms if atom.get_parent().get_resname() in ['HOH', 'WAT']]
+                train_complex_num_waters += len(waters_within_threshold)
+
+            r_b_ns = NeighborSearch(list(r_b_structure.get_atoms()))
+            for index, row in r_b_interface_residues.iterrows():
+                chain_id = row['chain']
+                residue = row['residue'].strip()
+                model = r_b_structure[0]
+                chain = model[chain_id]
+                if residue.lstrip("-").isdigit():
+                    residue = int(residue)
+                else:
+                    residue_index, residue_icode = residue[:-1], residue[-1:]
+                    residue = (" ", int(residue_index), residue_icode)
+                target_residue = chain[residue]
+                target_coords = np.array([atom.get_coord() for atom in target_residue.get_atoms() if atom.get_name() == 'CA']).squeeze()
+                interfacing_atoms = r_b_ns.search(target_coords, interfacing_water_distance_cutoff, 'A')
+                waters_within_threshold = [atom for atom in interfacing_atoms if atom.get_parent().get_resname() in ['HOH', 'WAT']]
+                train_complex_num_waters += len(waters_within_threshold)
+
+        # Collect (and, if necessary, extract) all validation PDB files
+        val_num_complexes = 0
+        val_complex_num_waters = 0
+        pairs_postprocessed_val_txt = os.path.join(output_dir, 'pairs-postprocessed-val-before-structure-based-filtering.txt')
+        assert os.path.exists(pairs_postprocessed_val_txt), "DIPS-Plus validation filenames must be curated in advance."
+        with open(pairs_postprocessed_val_txt, "r") as f:
+            val_filenames = [line.strip() for line in f.readlines()]
+        for val_filename in tqdm(val_filenames):
+            try:
+                postprocessed_val_pair: pa.Pair = pd.read_pickle(os.path.join(output_dir, val_filename))
+            except Exception as e:
+                logging.error(f"Could not open postprocessed validation pair {os.path.join(output_dir, val_filename)} due to: {e}")
+                continue
+            pdb_code = postprocessed_val_pair.df0.pdb_name[0].split("_")[0][1:3]
+            pdb_dir = os.path.join(Path(output_dir).parent.parent, "raw", "pdb", pdb_code)
+            l_b_pdb_filepath = os.path.join(pdb_dir, postprocessed_val_pair.df0.pdb_name[0])
+            r_b_pdb_filepath = os.path.join(pdb_dir, postprocessed_val_pair.df1.pdb_name[0])
+            l_b_df0_chains = postprocessed_val_pair.df0.chain.unique()
+            r_b_df1_chains = postprocessed_val_pair.df1.chain.unique()
+            assert (
+                len(postprocessed_val_pair.df0.pdb_name.unique()) == len(l_b_df0_chains) == 1
+            ), "Only a single PDB filename and chain identifier can be associated with a single validation example."
+            assert (
+                len(postprocessed_val_pair.df1.pdb_name.unique()) == len(r_b_df1_chains) == 1
+            ), "Only a single PDB filename and chain identifier can be associated with a single validation example."
+            if not os.path.exists(l_b_pdb_filepath) and os.path.exists(l_b_pdb_filepath + ".gz"):
+                gunzip_file(l_b_pdb_filepath)
+            if not os.path.exists(r_b_pdb_filepath) and os.path.exists(r_b_pdb_filepath + ".gz"):
+                gunzip_file(r_b_pdb_filepath)
+            if not os.path.exists(l_b_pdb_filepath):
+                download_pdb_file(os.path.basename(l_b_pdb_filepath), l_b_pdb_filepath)
+            if not os.path.exists(r_b_pdb_filepath):
+                download_pdb_file(os.path.basename(r_b_pdb_filepath), r_b_pdb_filepath)
+            assert os.path.exists(l_b_pdb_filepath) and os.path.exists(r_b_pdb_filepath), "Both left and right-bound PDB files collected must exist."
+
+            l_b_structure = parser.get_structure('protein', l_b_pdb_filepath)
+            r_b_structure = parser.get_structure('protein', r_b_pdb_filepath)
+
+            l_b_interface_residues = postprocessed_val_pair.df0[postprocessed_val_pair.df0.index.isin(postprocessed_val_pair.pos_idx[:, 0])]
+            r_b_interface_residues = postprocessed_val_pair.df1[postprocessed_val_pair.df1.index.isin(postprocessed_val_pair.pos_idx[:, 1])]
+
+            val_num_complexes += 1
+
+            l_b_ns = NeighborSearch(list(l_b_structure.get_atoms()))
+            for index, row in l_b_interface_residues.iterrows():
+                chain_id = row['chain']
+                residue = row['residue'].strip()
+                model = l_b_structure[0]
+                chain = model[chain_id]
+                if residue.lstrip("-").isdigit():
+                    residue = int(residue)
+                else:
+                    residue_index, residue_icode = residue[:-1], residue[-1:]
+                    residue = (" ", int(residue_index), residue_icode)
+                target_residue = chain[residue]
+                target_coords = np.array([atom.get_coord() for atom in target_residue.get_atoms() if atom.get_name() == 'CA']).squeeze()
+                interfacing_atoms = l_b_ns.search(target_coords, interfacing_water_distance_cutoff, 'A')
+                waters_within_threshold = [atom for atom in interfacing_atoms if atom.get_parent().get_resname() in ['HOH', 'WAT']]
+                val_complex_num_waters += len(waters_within_threshold)
+
+            r_b_ns = NeighborSearch(list(r_b_structure.get_atoms()))
+            for index, row in r_b_interface_residues.iterrows():
+                chain_id = row['chain']
+                residue = row['residue'].strip()
+                model = r_b_structure[0]
+                chain = model[chain_id]
+                if residue.lstrip("-").isdigit():
+                    residue = int(residue)
+                else:
+                    residue_index, residue_icode = residue[:-1], residue[-1:]
+                    residue = (" ", int(residue_index), residue_icode)
+                target_residue = chain[residue]
+                target_coords = np.array([atom.get_coord() for atom in target_residue.get_atoms() if atom.get_name() == 'CA']).squeeze()
+                interfacing_atoms = r_b_ns.search(target_coords, interfacing_water_distance_cutoff, 'A')
+                waters_within_threshold = [atom for atom in interfacing_atoms if atom.get_parent().get_resname() in ['HOH', 'WAT']]
+                val_complex_num_waters += len(waters_within_threshold)
+
+        # Train complexes
+        train_num_waters_per_complex = train_complex_num_waters / train_num_complexes
+        logging.info(f"Number of waters, on average, in each training complex: {train_num_waters_per_complex}")
+
+        # Validation complexes
+        val_num_waters_per_complex = val_complex_num_waters / val_num_complexes
+        logging.info(f"Number of waters, on average, in each validation complex: {val_num_waters_per_complex}")
+
+        # Train + Validation complexes
+        train_val_num_waters_per_complex = (train_complex_num_waters + val_complex_num_waters) / (train_num_complexes + val_num_complexes)
+        logging.info(f"Number of waters, on average, in each training (or validation) complex: {train_val_num_waters_per_complex}")
+
+        logger.info("Finished analyzing interface waters for all training and validation complexes")
+
+    else:
+        raise NotImplementedError(f"Source type {source_type} is currently not supported.")
+
+
+if __name__ == "__main__":
+    log_fmt = '%(asctime)s %(levelname)s %(process)d: %(message)s'
+    logging.basicConfig(level=logging.INFO, format=log_fmt)
+
+    main()
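
The new script above repeats the residue-lookup and water-counting logic four times (training and validation, left- and right-bound structures), and only the first copy includes the fallback for an empty insertion code when parsing the residue identifier. A possible consolidation is sketched below; it is not part of the commit, and parse_residue_id and count_interfacing_waters are hypothetical helper names introduced here for illustration:

# Hedged refactoring sketch (not part of this commit): factor the repeated
# residue-ID parsing and water counting into helpers shared by the train and
# validation loops.
import numpy as np
from Bio.PDB import NeighborSearch


def parse_residue_id(residue_field: str):
    """Map a dataframe residue string (e.g. '42', '-3', '42A') to a Bio.PDB residue ID."""
    residue_field = residue_field.strip()
    if residue_field.lstrip("-").isdigit():
        return int(residue_field)
    residue_index, residue_icode = residue_field[:-1], residue_field[-1:]
    if residue_icode.strip() == "":
        return int(residue_field)
    # Bio.PDB residue IDs are (hetfield, sequence number, insertion code) tuples
    return (" ", int(residue_index), residue_icode)


def count_interfacing_waters(structure, interface_residues, distance_cutoff: float) -> int:
    """Count water atoms within `distance_cutoff` angstroms of each interface residue's CA atom."""
    neighbor_search = NeighborSearch(list(structure.get_atoms()))
    model = structure[0]
    num_waters = 0
    for _, row in interface_residues.iterrows():
        target_residue = model[row["chain"]][parse_residue_id(row["residue"])]
        ca_coords = np.array([atom.get_coord() for atom in target_residue.get_atoms()
                              if atom.get_name() == "CA"]).squeeze()
        interfacing_atoms = neighbor_search.search(ca_coords, distance_cutoff, "A")
        num_waters += sum(1 for atom in interfacing_atoms
                          if atom.get_parent().get_resname() in ("HOH", "WAT"))
    return num_waters

With helpers like these, each of the four blocks reduces to one count_interfacing_waters call per bound structure, and insertion-code handling is applied uniformly.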
