|
4 | 4 |
|
5 | 5 | import atom3.pair as pa
|
6 | 6 | import matplotlib.pyplot as plt
|
| 7 | +import numpy as np |
7 | 8 | import pandas as pd
|
8 | 9 | import seaborn as sns
|
9 | 10 |
|
|
17 | 18 | @click.command()
|
18 | 19 | @click.argument('output_dir', default='../DIPS/final/raw', type=click.Path())
|
19 | 20 | @click.option('--source_type', default='rcsb', type=click.Choice(['rcsb', 'db5']))
|
20 |
| -@click.option('--feature_types_to_correlate', default='rcsb', type=click.Choice(['rsa_value-rd_value'])) |
| 21 | +@click.option('--feature_types_to_correlate', default='rcsb', type=click.Choice(['rsa_value-rd_value', 'rsa_value-cn_value', 'rd_value-cn_value'])) |
21 | 22 | def main(output_dir: str, source_type: str, feature_types_to_correlate: str):
|
22 | 23 | logger = logging.getLogger(__name__)
|
23 | 24 | logger.info("Analyzing feature correlation for each dataset example...")
|
24 | 25 |
|
25 |
| - if feature_types_to_correlate == "rsa_value-rd_value": |
26 |
| - features_to_correlate = feature_types_to_correlate.split("-") |
27 |
| - else: |
28 |
| - raise NotImplementedError(f"Feature types {features_to_correlate} are currently not supported.") |
| 26 | + features_to_correlate = feature_types_to_correlate.split("-") |
29 | 27 | assert len(features_to_correlate) == 2, "Exactly two features may be currently compared for correlation measures."
|
30 | 28 |
|
31 | 29 | if source_type.lower() == "rcsb":
|
@@ -63,8 +61,8 @@ def main(output_dir: str, source_type: str, feature_types_to_correlate: str):
|
63 | 61 | download_pdb_file(os.path.basename(r_b_pdb_filepath), r_b_pdb_filepath)
|
64 | 62 | assert os.path.exists(l_b_pdb_filepath) and os.path.exists(r_b_pdb_filepath), "Both left and right-bound PDB files collected must exist."
|
65 | 63 |
|
66 |
| - l_b_df0_feature_values = postprocessed_train_pair.df0[features_to_correlate].dropna() |
67 |
| - r_b_df1_feature_values = postprocessed_train_pair.df1[features_to_correlate].dropna() |
| 64 | + l_b_df0_feature_values = postprocessed_train_pair.df0[features_to_correlate].applymap(lambda x: np.nan if x == 'NA' else x).dropna().apply(pd.to_numeric) |
| 65 | + r_b_df1_feature_values = postprocessed_train_pair.df1[features_to_correlate].applymap(lambda x: np.nan if x == 'NA' else x).dropna().apply(pd.to_numeric) |
68 | 66 | train_feature_values.append(pd.concat([l_b_df0_feature_values, r_b_df1_feature_values]))
|
69 | 67 |
|
70 | 68 | # Collect (and, if necessary, extract) all validation PDB files
|
@@ -101,8 +99,8 @@ def main(output_dir: str, source_type: str, feature_types_to_correlate: str):
|
101 | 99 | download_pdb_file(os.path.basename(r_b_pdb_filepath), r_b_pdb_filepath)
|
102 | 100 | assert os.path.exists(l_b_pdb_filepath) and os.path.exists(r_b_pdb_filepath), "Both left and right-bound PDB files collected must exist."
|
103 | 101 |
|
104 |
| - l_b_df0_feature_values = postprocessed_val_pair.df0[features_to_correlate].dropna() |
105 |
| - r_b_df1_feature_values = postprocessed_val_pair.df1[features_to_correlate].dropna() |
| 102 | + l_b_df0_feature_values = postprocessed_val_pair.df0[features_to_correlate].applymap(lambda x: np.nan if x == 'NA' else x).dropna().apply(pd.to_numeric) |
| 103 | + r_b_df1_feature_values = postprocessed_val_pair.df1[features_to_correlate].applymap(lambda x: np.nan if x == 'NA' else x).dropna().apply(pd.to_numeric) |
106 | 104 | val_feature_values.append(pd.concat([l_b_df0_feature_values, r_b_df1_feature_values]))
|
107 | 105 |
|
108 | 106 | # Train PDBs
|
|
0 commit comments