BioinfoMachineLearning
diff --git a/project/datasets/CASP_CAPRI/casp_capri_dgl_data_module.py b/project/datasets/CASP_CAPRI/casp_capri_dgl_data_module.py
 7 insertions(+), 7 deletions(-)
diff --git a/project/datasets/CASP_CAPRI/casp_capri_dgl_dataset.py b/project/datasets/CASP_CAPRI/casp_capri_dgl_dataset.py
 3 insertions(+), 26 deletions(-)
diff --git a/project/datasets/DIPS/dips_dgl_data_module.py b/project/datasets/DIPS/dips_dgl_data_module.py
 8 insertions(+), 10 deletions(-)
diff --git a/project/datasets/DIPS/dips_dgl_dataset.py b/project/datasets/DIPS/dips_dgl_dataset.py
 2 insertions(+), 25 deletions(-)
@@ -19,7 +19,7 @@ class CASPCAPRIDGLDataModule(LightningDataModule):
     casp_capri_test = None
 
     def __init__(self, data_dir: str, batch_size: int, num_dataloader_workers: int, knn: int, self_loops: bool,
-                 pn_ratio: float, percent_to_use: float, use_dgl: bool, process_complexes: bool, input_indep: bool):
+                 pn_ratio: float, percent_to_use: float, process_complexes: bool, input_indep: bool):
         super().__init__()
 
         self.data_dir = data_dir
@@ -29,26 +29,26 @@ def __init__(self, data_dir: str, batch_size: int, num_dataloader_workers: int,
         self.self_loops = self_loops
         self.pn_ratio = pn_ratio
         self.percent_to_use = percent_to_use  # Fraction of CASP-CAPRI dataset splits to use
-        self.use_dgl = use_dgl  # Whether to process each complex into a pair of DGL graphs for its final representation
         self.process_complexes = process_complexes  # Whether to process any unprocessed complexes before training
         self.input_indep = input_indep  # Whether to use an input-independent pipeline to train the model
+        self.collate_fn = dgl_picp_collate  # Which collation function to use
 
     def setup(self, stage: Optional[str] = None):
         # Assign testing dataset for use in DataLoaders - called on every GPU
         self.casp_capri_test = CASPCAPRIDGLDataset(mode='test', raw_dir=self.data_dir, knn=self.knn,
-                                                   self_loops=self.self_loops, pn_ratio=self.pn_ratio,
-                                                   percent_to_use=self.percent_to_use, use_dgl=self.use_dgl,
+                                                   geo_nbrhd_size=2, self_loops=self.self_loops, pn_ratio=self.pn_ratio,
+                                                   percent_to_use=self.percent_to_use,
                                                    process_complexes=self.process_complexes,
                                                    input_indep=self.input_indep)
 
     def train_dataloader(self) -> DataLoader:
         return DataLoader(self.casp_capri_test, batch_size=self.batch_size, shuffle=True,
-                          num_workers=self.num_dataloader_workers, collate_fn=dgl_picp_collate, pin_memory=True)
+                          num_workers=self.num_dataloader_workers, collate_fn=self.collate_fn, pin_memory=True)
 
     def val_dataloader(self) -> DataLoader:
         return DataLoader(self.casp_capri_test, batch_size=self.batch_size, shuffle=False,
-                          num_workers=self.num_dataloader_workers, collate_fn=dgl_picp_collate, pin_memory=True)
+                          num_workers=self.num_dataloader_workers, collate_fn=self.collate_fn, pin_memory=True)
 
     def test_dataloader(self) -> DataLoader:
         return DataLoader(self.casp_capri_test, batch_size=self.batch_size, shuffle=False,
-                          num_workers=self.num_dataloader_workers, collate_fn=dgl_picp_collate, pin_memory=True)
+                          num_workers=self.num_dataloader_workers, collate_fn=self.collate_fn, pin_memory=True)
@@ -34,12 +34,10 @@ class CASPCAPRIDGLDataset(DGLDataset):
         Size of each edge's neighborhood when updating geometric edge features. Default: 2.
     self_loops: bool
         Whether to connect a given node to itself. Default: True.
-    pn_ratio: bool
+    pn_ratio: float
         The positive-negative ratio to use when assembling training labels for node-node pairs. Default: 0.1.
     percent_to_use: float
         How much of the dataset to load. Default: 1.00.
-    use_dgl: bool
-        Whether to process each complex into a pair of DGL graphs for its final representation. Default: True.
     process_complexes: bool
         Whether to process each unprocessed complex as we load in the dataset. Default: True.
     input_indep: bool
@@ -72,7 +70,6 @@ def __init__(self,
                  self_loops=True,
                  pn_ratio=0.1,
                  percent_to_use=1.00,
-                 use_dgl=True,
                  process_complexes=True,
                  input_indep=False,
                  force_reload=False,
@@ -87,7 +84,6 @@ def __init__(self,
         self.self_loops = self_loops
         self.pn_ratio = pn_ratio
         self.percent_to_use = percent_to_use  # How much of the dataset (e.g. CASP-CAPRI training dataset) to use
-        self.use_dgl = use_dgl  # Whether to process each complex into a pair of DGL graphs for its final representation
         self.process_complexes = process_complexes  # Whether to process any unprocessed complexes before training
         self.input_indep = input_indep  # Whether to use an input-independent pipeline to train the model
         self.final_dir = os.path.join(*self.root.split(os.sep)[:-1])
@@ -163,9 +159,8 @@ def process(self):
                 if not os.path.exists(processed_filepath):
                     processed_parent_dir_to_make = os.path.join(self.processed_dir, os.path.split(raw_path[0])[0])
                     os.makedirs(processed_parent_dir_to_make, exist_ok=True)
-                    process_complex_into_dict(raw_filepath, processed_filepath,
-                                              self.knn, self.geo_nbrhd_size, self.self_loops,
-                                              check_sequence=False, use_dgl=self.use_dgl)
+                    process_complex_into_dict(raw_filepath, processed_filepath, self.knn,
+                                              self.geo_nbrhd_size, self.self_loops, check_sequence=False)
 
     def has_cache(self):
         """Check if each complex is downloaded and available for testing."""
@@ -189,28 +184,10 @@ def __getitem__(self, idx):
         -------
         :class:`dict`
 
-            (If process_complexes_into_dicts() was run with use_dgl=True):
-            Protein complex, DGLGraphs for each of the complex's structures.
-
     - ``complex['graph1']:`` DGLGraph (of length M) containing each of the first graph's encoded node and edge features
     - ``complex['graph2']:`` DGLGraph (of length N) containing each of the second graph's encoded node and edge features
     - ``complex['examples']:`` PyTorch Tensor (of shape (M x N) x 3) containing the labels for inter-graph node pairs
     - ``complex['complex']:`` Python string describing the complex's code and original pdb filename
-    - ``complex['filepath']:`` Python string describing the complex's filepath
-
-            (If process_complexes_into_dicts() was run with use_dgl=False):
-            Protein complex, feature tensors for each node and edge and indices of each node's neighboring nodes.
-
-    - ``complex['graph1_node_feats']:`` PyTorch Tensor containing each of the first graph's encoded node features
-    - ``complex['graph2_node_feats']``: PyTorch Tensor containing each of the second graph's encoded node features
-    - ``complex['graph1_node_coords']:`` PyTorch Tensor containing each of the first graph's node coordinates
-    - ``complex['graph2_node_coords']``: PyTorch Tensor containing each of the second graph's node coordinates
-    - ``complex['graph1_edge_feats']:`` PyTorch Tensor containing each of the first graph's edge features for each node
-    - ``complex['graph2_edge_feats']:`` PyTorch Tensor containing each of the second graph's edge features for each node
-    - ``complex['graph1_nbrhd_indices']:`` PyTorch Tensor containing each of the first graph's neighboring node indices
-    - ``complex['graph2_nbrhd_indices']:`` PyTorch Tensor containing each of the second graph's neighboring node indices
-    - ``complex['examples']:`` PyTorch Tensor containing the labels for inter-graph node pairs
-    - ``complex['complex']:`` Python string describing the complex's code and original pdb filename
     - ``complex['filepath']:`` Python string describing the complex's filepath
         """
         # Assemble filepath of processed protein complex
 
@@ -21,7 +21,7 @@ class DIPSDGLDataModule(LightningDataModule):
     dips_test = None
 
     def __init__(self, data_dir: str, batch_size: int, num_dataloader_workers: int, knn: int, self_loops: bool,
-                 pn_ratio: float, percent_to_use: float, use_dgl: bool, process_complexes: bool, input_indep: bool):
+                 pn_ratio: float, percent_to_use: float, process_complexes: bool, input_indep: bool):
         super().__init__()
 
         self.data_dir = data_dir
@@ -31,32 +31,30 @@ def __init__(self, data_dir: str, batch_size: int, num_dataloader_workers: int,
         self.self_loops = self_loops
         self.pn_ratio = pn_ratio
         self.percent_to_use = percent_to_use  # Fraction of DIPS dataset splits to use
-        self.use_dgl = use_dgl  # Whether to process each complex into a pair of DGL graphs for its final representation
         self.process_complexes = process_complexes  # Whether to process any unprocessed complexes before training
         self.input_indep = input_indep  # Whether to use an input-independent pipeline to train the model
+        self.collate_fn = dgl_picp_collate  # Which collation function to use
 
     def setup(self, stage: Optional[str] = None):
         # Assign training/validation/testing data set for use in DataLoaders - called on every GPU
         self.dips_train = DIPSDGLDataset(mode='train', raw_dir=self.data_dir, knn=self.knn, self_loops=self.self_loops,
                                          pn_ratio=self.pn_ratio, percent_to_use=self.percent_to_use,
-                                         use_dgl=self.use_dgl, process_complexes=self.process_complexes,
-                                         input_indep=self.input_indep)
+                                         process_complexes=self.process_complexes, input_indep=self.input_indep)
         self.dips_val = DIPSDGLDataset(mode='val', raw_dir=self.data_dir, knn=self.knn, self_loops=self.self_loops,
-                                       pn_ratio=self.pn_ratio, percent_to_use=self.percent_to_use, use_dgl=self.use_dgl,
+                                       pn_ratio=self.pn_ratio, percent_to_use=self.percent_to_use,
                                        process_complexes=self.process_complexes, input_indep=self.input_indep)
         self.dips_test = DIPSDGLDataset(mode='test', raw_dir=self.data_dir, knn=self.knn, self_loops=self.self_loops,
                                         pn_ratio=self.pn_ratio, percent_to_use=self.percent_to_use,
-                                        use_dgl=self.use_dgl, process_complexes=self.process_complexes,
-                                        input_indep=self.input_indep)
+                                        process_complexes=self.process_complexes, input_indep=self.input_indep)
 
     def train_dataloader(self) -> DataLoader:
         return DataLoader(self.dips_train, batch_size=self.batch_size, shuffle=True,
-                          num_workers=self.num_dataloader_workers, collate_fn=dgl_picp_collate, pin_memory=True)
+                          num_workers=self.num_dataloader_workers, collate_fn=self.collate_fn, pin_memory=True)
 
     def val_dataloader(self) -> DataLoader:
         return DataLoader(self.dips_val, batch_size=self.batch_size, shuffle=False,
-                          num_workers=self.num_dataloader_workers, collate_fn=dgl_picp_collate, pin_memory=True)
+                          num_workers=self.num_dataloader_workers, collate_fn=self.collate_fn, pin_memory=True)
 
     def test_dataloader(self) -> DataLoader:
         return DataLoader(self.dips_test, batch_size=self.batch_size, shuffle=False,
-                          num_workers=self.num_dataloader_workers, collate_fn=dgl_picp_collate, pin_memory=True)
+                          num_workers=self.num_dataloader_workers, collate_fn=self.collate_fn, pin_memory=True)
@@ -45,8 +45,6 @@ class DIPSDGLDataset(DGLDataset):
         The positive-negative ratio to use when assembling training labels for node-node pairs. Default: 0.1.
     percent_to_use: float
         How much of the dataset to load. Default: 1.00.
-    use_dgl: bool
-        Whether to process each complex into a pair of DGL graphs for its final representation. Default: True.
     process_complexes: bool
         Whether to process each unprocessed complex as we load in the dataset. Default: True.
     input_indep: bool
@@ -83,7 +81,6 @@ def __init__(self,
                  self_loops=True,
                  pn_ratio=0.1,
                  percent_to_use=1.00,
-                 use_dgl=True,
                  process_complexes=True,
                  input_indep=False,
                  train_viz=False,
@@ -99,7 +96,6 @@ def __init__(self,
         self.self_loops = self_loops
         self.pn_ratio = pn_ratio
         self.percent_to_use = percent_to_use  # How much of the requested dataset (e.g. DIPS-Plus) to use
-        self.use_dgl = use_dgl  # Whether to process each complex into a pair of DGL graphs for its final representation
         self.process_complexes = process_complexes  # Whether to process any unprocessed complexes before training
         self.input_indep = input_indep  # Whether to use an input-independent pipeline to train the model
         self.train_viz = train_viz  # Whether to curate the training loop's validation samples for visualization
@@ -183,9 +179,8 @@ def process(self):
                 if not os.path.exists(processed_filepath):
                     processed_parent_dir_to_make = os.path.join(self.processed_dir, os.path.split(raw_path[0])[0])
                     os.makedirs(processed_parent_dir_to_make, exist_ok=True)
-                    process_complex_into_dict(raw_filepath, processed_filepath,
-                                              self.knn, self.geo_nbrhd_size, self.self_loops,
-                                              check_sequence=False, use_dgl=self.use_dgl)
+                    process_complex_into_dict(raw_filepath, processed_filepath, self.knn,
+                                              self.geo_nbrhd_size, self.self_loops, check_sequence=False)
 
     def has_cache(self):
         """Check if each complex is downloaded and available for training, validation, or testing."""
@@ -209,28 +204,10 @@ def __getitem__(self, idx):
         -------
         :class:`dict`
 
-            (If process_complexes_into_dicts() was run with use_dgl=True):
-            Protein complex, DGLGraphs for each of the complex's structures.
-
     - ``complex['graph1']:`` DGLGraph (of length M) containing each of the first graph's encoded node and edge features
     - ``complex['graph2']:`` DGLGraph (of length N) containing each of the second graph's encoded node and edge features
     - ``complex['examples']:`` PyTorch Tensor (of shape (M x N) x 3) containing the labels for inter-graph node pairs
     - ``complex['complex']:`` Python string describing the complex's code and original pdb filename
-    - ``complex['filepath']:`` Python string describing the complex's filepath
-
-            (If process_complexes_into_dicts() was run with use_dgl=False):
-            Protein complex, feature tensors for each node and edge and indices of each node's neighboring nodes.
-
-    - ``complex['graph1_node_feats']:`` PyTorch Tensor containing each of the first graph's encoded node features
-    - ``complex['graph2_node_feats']``: PyTorch Tensor containing each of the second graph's encoded node features
-    - ``complex['graph1_node_coords']:`` PyTorch Tensor containing each of the first graph's node coordinates
-    - ``complex['graph2_node_coords']``: PyTorch Tensor containing each of the second graph's node coordinates
-    - ``complex['graph1_edge_feats']:`` PyTorch Tensor containing each of the first graph's edge features for each node
-    - ``complex['graph2_edge_feats']:`` PyTorch Tensor containing each of the second graph's edge features for each node
-    - ``complex['graph1_nbrhd_indices']:`` PyTorch Tensor containing each of the first graph's neighboring node indices
-    - ``complex['graph2_nbrhd_indices']:`` PyTorch Tensor containing each of the second graph's neighboring node indices
-    - ``complex['examples']:`` PyTorch Tensor containing the labels for inter-graph node pairs
-    - ``complex['complex']:`` Python string describing the complex's code and original pdb filename
     - ``complex['filepath']:`` Python string describing the complex's filepath
         """
         # Assemble filepath of processed protein complex