Merge pull request #19 from mufeili/master

Two Submissions on BBBP
deepchem · Dec 30, 2020 · 9bc46bb
2 parents f0a50fd + de327f4
commit 9bc46bb
Show file tree

Hide file tree

Showing 7 changed files with 39 additions and 10 deletions.
diff --git a/README.md b/README.md
@@ -6,12 +6,21 @@
 
 | Rank | Model         | Featurization  | Test ROC-AUC     | Validation ROC-AUC | Contact                           | References	                                                                           | Date           |
 | ---- | ------------- | -------------- | ---------------- | ------------------ | --------------------------------- | ---------------------------------------------------------------------------------------- | -------------- |
-| 1    | Random Forest | 1024-bit ECFP2 | 0.8507 +- 0.0072 | 0.7368 +- 0.0066   | [Mufei Li]([email protected]) | [Paper](https://www.stat.berkeley.edu/~breiman/randomforest2001.pdf), [Code](./examples) | Dec 2nd, 2020  |
+| 1    | Random Forest | 1024-bit ECFP4 | 0.8507 +- 0.0072 | 0.7368 +- 0.0066   | [Mufei Li]([email protected]) | [Paper](https://www.stat.berkeley.edu/~breiman/randomforest2001.pdf), [Code](./examples) | Dec 2nd, 2020  |
 | 2    | GCN           | GraphConv      | 0.8175 +- 0.0193 | 0.7430 +- 0.0194   | [Mufei Li]([email protected]) | [Paper](https://arxiv.org/abs/1609.02907), [Code](./examples)                            | Dec 20th, 2020 |
 
 ### BACE Regression
 
 | Rank | Model         | Featurization  | Test RMSE        | Validation RMSE  | Contact                           | References	                                                                             | Date           |
 | ---- | ------------- | -------------- | ---------------- | ---------------- | --------------------------------- | ---------------------------------------------------------------------------------------- | -------------- |
-| 1    | Random Forest | 1024-bit ECFP2 | 1.3178 +- 0.0081 | 0.6716 +- 0.0059 | [Mufei Li]([email protected]) | [Paper](https://www.stat.berkeley.edu/~breiman/randomforest2001.pdf), [Code](./examples) | Dec 26th, 2020 |
+| 1    | Random Forest | 1024-bit ECFP4 | 1.3178 +- 0.0081 | 0.6716 +- 0.0059 | [Mufei Li]([email protected]) | [Paper](https://www.stat.berkeley.edu/~breiman/randomforest2001.pdf), [Code](./examples) | Dec 26th, 2020 |
 | 2    | GCN           | GraphConv      | 1.6450 +- 0.1325 | 0.5244 +- 0.0200 | [Mufei Li]([email protected]) | [Paper](https://arxiv.org/abs/1609.02907), [Code](./examples)                            | Dec 26th, 2020 |
+
+## Physiology
+
+### BBBP
+
+| Rank | Model         | Featurization  | Test ROC-AUC     | Validation ROC-AUC | Contact                           | References                                                                               | Date           |
+| ---- | ------------- | -------------- | ---------------- | ------------------ | --------------------------------- | ---------------------------------------------------------------------------------------- | -------------- |
+| 1    | Random Forest | 1024-bit ECFP4 | 0.9540 +- 0.0038 | 0.9062 +- 0.0079   | [Mufei Li]([email protected]) | [Paper](https://www.stat.berkeley.edu/~breiman/randomforest2001.pdf), [Code](./examples) | Dec 30th, 2020 |
+| 2    | GCN           | GraphConv      | 0.9214 +- 0.0106 | 0.9445 +- 0.0049   | [Mufei Li]([email protected]) | [Paper](https://arxiv.org/abs/1609.02907), [Code](./examples)                            | Dec 30th, 2020 |
diff --git a/examples/README.md b/examples/README.md
@@ -17,6 +17,7 @@ The feasible arguments include:
     - Specifies the dataset to use, which can be one of the following:
         - `BACE_classification`
         - `BACE_regression`
+        - `BBBP`
 - **Hyperparameter Search (optional)**: `-hs`
     - Perform a hyperparameter search using Bayesian optimization. It determines the best 
       hyperparameters based on the validation metric averaged across 3 runs.
@@ -38,6 +39,7 @@ The feasible arguments include:
     - Specifies the dataset to use, which can be one of the following:
         - `BACE_classification`
         - `BACE_regression`
+        - `BBBP`
 - **Hyperparameter Search (optional)**: `-hs`
     - Perform a hyperparameter search using Bayesian optimization. It determines the best 
       hyperparameters based on the validation metric averaged across 3 runs.

diff --git a/examples/configures/GCN_GC/BBBP.json b/examples/configures/GCN_GC/BBBP.json
@@ -0,0 +1,8 @@
+{
+  "batchnorm": true,
+  "dropout": 0.14190910017170108,
+  "hidden_feats": 128,
+  "lr": 0.11349404033122097,
+  "num_gnn_layers": 3,
+  "residual": true
+}
diff --git a/examples/configures/RF_ECFP/BBBP.json b/examples/configures/RF_ECFP/BBBP.json
@@ -0,0 +1,6 @@
+{
+  "bootstrap": true,
+  "criterion": "entropy",
+  "min_samples_split": 8,
+  "n_estimators": 100
+}
diff --git a/examples/fingerprint.py b/examples/fingerprint.py
@@ -28,7 +28,7 @@ def rf_model_builder(model_dir, hyperparams, mode):
 
 
 def load_model(args, tasks, hyperparams):
-  if args['dataset'] in ['BACE_classification']:
+  if args['dataset'] in ['BACE_classification', 'BBBP']:
     mode = 'classification'
   elif args['dataset'] in ['BACE_regression']:
     mode = 'regression'
@@ -100,7 +100,7 @@ def init_hyper_search_space(args):
         'min_samples_split': hp.choice('min_samples_split', [2, 4, 8, 16, 32]),
         'bootstrap': hp.choice('bootstrap', [True, False]),
     }
-    if args['dataset'] in ['BACE_classification']:
+    if args['dataset'] in ['BACE_classification', 'BBBP']:
       search_space['criterion'] = hp.choice('criterion', ["gini", "entropy"])
     else:
       search_space['criterion'] = hp.choice('criterion', ["mse", "mae"])
@@ -154,7 +154,7 @@ def objective(hyperparams):
   parser.add_argument(
       '-d',
       '--dataset',
-      choices=['BACE_classification', 'BACE_regression'],
+      choices=['BACE_classification', 'BACE_regression', 'BBBP'],
       help='Dataset to use')
   parser.add_argument(
       '-m',
@@ -205,7 +205,7 @@ def objective(hyperparams):
   else:
     print('Use the manually specified hyperparameters')
     with open('configures/{}_{}/{}.json'.format(
-            args['model'], args['featurizer'], args['dataset'])) as f:
+        args['model'], args['featurizer'], args['dataset'])) as f:
       default_hyperparams = json.load(f)
     val_metrics, test_metrics = main(args['result_path'], args,
                                      default_hyperparams)

diff --git a/examples/gnn.py b/examples/gnn.py
@@ -11,7 +11,7 @@
 
 
 def load_model(save_pth, args, tasks, hyperparams):
-  if args['dataset'] in ['BACE_classification']:
+  if args['dataset'] in ['BACE_classification', 'BBBP']:
     mode = 'classification'
     # binary classification
     n_classes = 2
@@ -177,7 +177,7 @@ def objective(hyperparams):
   parser.add_argument(
       '-d',
       '--dataset',
-      choices=['BACE_classification', 'BACE_regression'],
+      choices=['BACE_classification', 'BACE_regression', 'BBBP'],
       help='Dataset to use')
   parser.add_argument(
       '-m',
@@ -236,7 +236,7 @@ def objective(hyperparams):
   else:
     print('Use the manually specified hyperparameters')
     with open('configures/{}_{}/{}.json'.format(
-            args['model'], args['featurizer'], args['dataset'])) as f:
+        args['model'], args['featurizer'], args['dataset'])) as f:
       default_hyperparams = json.load(f)
     val_metrics, test_metrics = main(args['result_path'], args,
                                      default_hyperparams)

diff --git a/examples/utils.py b/examples/utils.py
@@ -4,7 +4,7 @@
 
 
 def decide_metric(dataset):
-  if dataset == 'BACE_classification':
+  if dataset in ['BACE_classification', 'BBBP']:
     return 'roc_auc'
   elif dataset == 'BACE_regression':
     return 'rmse'
@@ -67,6 +67,10 @@ def load_dataset(args):
     from deepchem.molnet import load_bace_classification
     tasks, all_dataset, transformers = load_bace_classification(
         featurizer=featurizer, splitter=splitter, reload=False)
+  elif args['dataset'] == 'BBBP':
+    from deepchem.molnet import load_bbbp
+    tasks, all_dataset, transformers = load_bbbp(
+        featurizer=featurizer, splitter=splitter, reload=False)
   elif args['dataset'] == 'BACE_regression':
     from deepchem.molnet import load_bace_regression
     tasks, all_dataset, transformers = load_bace_regression(