# protein_regression/make_plot_brute_force.py at master · MachineLearningLifeScience/protein_regression · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import mlflow
import numpy as np
from gpflow.kernels import SquaredExponential
from mlflow.entities import ViewType

from algorithms.gp_on_real_space import GPonRealSpace
from algorithms.KNN import KNN
from algorithms.one_hot_gp import GPOneHotSequenceSpace
from algorithms.random_forest import RandomForest
from data.load_dataset import get_alphabet
from data.train_test_split import RandomSplitter
from util.mlflow.constants import (
    DATASET,
    METHOD,
    MSE,
    NONSENSE,
    ONE_HOT,
    REPRESENTATION,
    SPLIT,
    TRANSFORMER,
    VAE,
)
from util.mlflow.convenience_functions import find_experiments_by_tags
from visualization.plot_metric_for_dataset import plot_metric_for_dataset

# Script configuration: which datasets, split protocol, and metric to pull
# from MLflow. The gathered values are stored in `results_dict` (a plain
# dict keyed by dataset name) and plotted at the end of the script.
datasets = ["MTH3", "TIMB", "UBQT", "1FQG", "CALM", "BRCA"]
train_test_splitter = RandomSplitter  # alternative noted by the authors: BlockPostionSplitter
metric = MSE
# NOTE: the loop below iterates over the keys of `algos`, not over this list.
representations = [VAE, TRANSFORMER, ONE_HOT, NONSENSE]
# Alternative subset left by the authors: [ONE_HOT, NONSENSE]
results_dict = {}
last_result_length = None


def _real_space_methods():
    """Return fresh instances of the methods run on real-valued representations."""
    return [
        GPonRealSpace(),
        GPonRealSpace(kernel=SquaredExponential()),
        RandomForest(),
        KNN(),
    ]


def _one_hot_methods():
    """Return fresh instances of the methods run on the one-hot representation.

    NOTE(review): the alphabet size is always taken from the "BRCA" dataset,
    whichever dataset is later queried -- confirm this is intended.
    """
    return [
        GPOneHotSequenceSpace(alphabet_size=len(get_alphabet("BRCA"))),
        GPOneHotSequenceSpace(
            alphabet_size=len(get_alphabet("BRCA")), kernel=SquaredExponential()
        ),
        RandomForest(),
        KNN(),
    ]


# Representation -> list of method instances; only their get_name() is used
# below to look up the matching MLflow experiment.
algos = {
    VAE: _real_space_methods(),
    TRANSFORMER: _real_space_methods(),
    ONE_HOT: _one_hot_methods(),
    NONSENSE: _real_space_methods(),
}

# Collect, for every dataset, the logged `metric` values of each
# (representation, method) pair from MLflow. The outcome is
# results_dict[dataset]["<representation> <method name>"] -> list of values.
# A single tracking client is reused for all metric-history lookups.
client = mlflow.tracking.MlflowClient()
for dataset in datasets:
    # The splitter is constructed from `dataset` alone, so its name is
    # computed once per dataset instead of once per method.
    split_name = train_test_splitter(dataset).get_name()
    result_dict = {}
    for representation, methods in algos.items():
        for method in methods:
            method_name = method.get_name()
            exps = find_experiments_by_tags(
                {
                    DATASET: dataset,
                    METHOD: method_name,
                    REPRESENTATION: representation,
                    SPLIT: split_name,
                }
            )
            # Exactly one experiment must match the tag combination. Raised
            # explicitly (instead of `assert`) so the check is not stripped
            # when Python runs with -O; type and message are unchanged.
            if len(exps) != 1:
                raise AssertionError(representation + method_name + dataset)
            runs = mlflow.search_runs(
                experiment_ids=[exps[0].experiment_id],
                run_view_type=ViewType.ACTIVE_ONLY,
            )
            # Concatenate the metric history of every active run of the
            # experiment into one flat list of values.
            results = []
            for run_id in runs["run_id"].to_list():
                history = client.get_metric_history(run_id, metric)
                results.extend(entry.value for entry in history)
            result_dict[representation + " " + method_name] = results
    results_dict[dataset] = result_dict

# Dump the raw collected values, then hand them to the plotting helper.
print(results_dict)
# NOTE(review): `dataset` is the variable left over from the gathering loop,
# i.e. the last entry of `datasets`; this raises NameError if `datasets` is
# empty. Presumably the split name is the same for every dataset -- confirm.
cv_name = train_test_splitter(dataset).get_name()
plot_metric_for_dataset(metric_values=results_dict, cvtype=cv_name)