-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathblend.py
166 lines (147 loc) · 6.59 KB
/
blend.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import numpy as np
import pickle
import sklearn.model_selection
import os
try:
# Newer versions
import tensorflow.compat.v1 as tf
except ImportError:
# Fallback for older versions
import tensorflow as tf
def tf_blend(X, y, type_to_idx, lr, steps, do_individual_scores=True):
    """
    Does linear combination of solutions, where the weights
    are positive and sum to 1.

    The weights are the softmax of one trainable logit per solution. They are
    fitted with Adam on a stratified 50% training split by minimising the sum
    over types of log(mean absolute error of that type) / n_types, and scored
    on the remaining 50%.

    Parameters
    ----------
    X : array of shape (n_solutions, n_samples)
        Predictions of every solution; transposed internally.
    y : 1-D array of length n_samples
        Target values (indexed as ``y[batch, None]``).
    type_to_idx : dict
        Maps each type to the sample indices belonging to it. The iteration
        order of the dict defines the column order of the returned scores.
    lr : float
        Learning rate passed to the Adam optimizer.
    steps : int
        Number of passes over the training split.
    do_individual_scores : bool
        If True, also score every single solution on its own.

    Returns
    -------
    ensemble_weights : array of shape (n_solutions, 1)
        Softmax of the logits averaged over the last recorded steps.
    scores : array
        Row 0 holds the per-type test loss of the blend. If
        ``do_individual_scores`` is True, row ``1 + i`` holds the per-type
        test loss of solution ``i`` alone.

    Notes
    -----
    The train/test split is seeded, but the per-step reshuffling uses the
    global numpy random state, so results depend on that state.
    """
    x = X.T
    n_samples, n_features = x.shape
    n_classes = len(type_to_idx)
    classes = np.zeros((n_samples, n_classes))
    strat = np.zeros(n_samples, dtype=int)
    # One-hot type membership (for per-type averaging) and an integer type
    # label per sample (to stratify the split by type)
    for i, idx in enumerate(type_to_idx.values()):
        classes[idx, i] = 1
        strat[idx] = i

    # Initialize logits to zero (i.e. uniform weights) to avoid bias
    logits_init = np.zeros((n_features, 1))

    # Reset graph
    tf.reset_default_graph()

    # Tensorflow placeholders and variables
    classes_tf = tf.placeholder(tf.float32, [None, n_classes])
    x_tf = tf.placeholder(tf.float32, [None, n_features])
    logits_tf = tf.Variable(logits_init, dtype=tf.float32)
    # Softmax over the solution axis keeps the weights positive, summing to 1
    W = tf.nn.softmax(logits_tf, axis=0)
    y_pred = tf.matmul(x_tf, W)
    y_tf = tf.placeholder(tf.float32, [None, 1])

    # Per-type log of the mean absolute error; the small constants guard
    # against division by zero and log(0) for types absent from a batch
    abs_diff = tf.abs(y_tf - y_pred)
    class_diff = abs_diff * classes_tf
    cost = tf.reduce_sum(class_diff, axis=0)
    mean_cost = cost / (tf.reduce_sum(classes_tf, axis=0) + 1e-9)
    log_cost = tf.math.log(mean_cost + 1e-9) / n_classes
    total_cost = tf.reduce_sum(log_cost)

    test_size = 0.50
    train_step = tf.train.AdamOptimizer(lr).minimize(total_cost)

    # Keep track of progress
    scores = []
    running_logits = []

    # Get stratified train, test split
    train, test = sklearn.model_selection.train_test_split(
        np.arange(n_samples), stratify=strat, test_size=test_size,
        shuffle=True, random_state=42)

    # Ten mini-batches per pass, as an explicit integer count. Clamped so
    # that a tiny training split never produces empty batches.
    n_batches = max(1, min(10, train.size))
    # Number of trailing steps whose logits are averaged
    average_steps = 50 if steps > 100 else 20

    # The held-out feed never changes, so build it once
    test_feed = {x_tf: x[test], y_tf: y[test, None], classes_tf: classes[test]}

    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for step in range(steps):
            for batch in np.array_split(train, n_batches):
                feed = {x_tf: x[batch], y_tf: y[batch, None], classes_tf: classes[batch]}
                sess.run(train_step, feed_dict=feed)
            np.random.shuffle(train)
            # The first pass is not recorded
            if step > 0:
                running_logits.append(sess.run(logits_tf))

        # Average logits and get score and weights
        if running_logits:
            ensemble_logits = np.mean(running_logits[-average_steps:], 0)
        else:
            # steps <= 1: nothing was recorded, use the current logits
            ensemble_logits = sess.run(logits_tf)
        ensemble_feed = dict(test_feed)
        ensemble_feed[logits_tf] = ensemble_logits
        ensemble_loss, ensemble_weights = sess.run([log_cost, W], feed_dict=ensemble_feed)
        scores.append(ensemble_loss)

        # Do individual scores
        if do_individual_scores:
            test_weights = np.zeros((n_features, 1))
            individual_feed = dict(test_feed)
            # Feeding W directly bypasses the softmax, so a one-hot weight
            # vector scores a single solution on its own
            individual_feed[W] = test_weights
            for i in range(n_features):
                test_weights[:] = 0
                test_weights[i] = 1
                test_loss = sess.run(log_cost, feed_dict=individual_feed)
                scores.append(test_loss)

    return ensemble_weights, np.asarray(scores)
def write_scores(ranks, names, scores, types, filename):
    """
    Write a table of per-type scores to `filename`.

    The header is ``rank, name``, one ``score(<type>)`` column per entry of
    `types` and a final ``scores(total)`` column. Each following line holds
    the rank and name of one entry, its scores with three decimals and the
    sum of those scores. Fields are separated by ``", "``.

    `ranks`, `names` and `scores` are indexed in parallel; `scores[i]` is
    the sequence of per-type scores of entry `i`.
    """
    with open(filename, "w") as out:
        # Header line
        header_cells = ["rank", "name"]
        header_cells.extend(f"score({type_})" for type_ in types)
        header_cells.append("scores(total)")
        out.write(", ".join(header_cells) + "\n")

        # One line per entry
        for row, rank in enumerate(ranks):
            row_scores = scores[row]
            cells = [f"{rank}", f"{names[row]}"]
            cells.extend(f"{value:.3f}" for value in row_scores)
            cells.append(f"{sum(row_scores):.3f}")
            out.write(", ".join(cells) + "\n")
def write_weights(ranks, names, weights, filename, types=None, by_type=False):
    """
    Write a table of blending weights to `filename`.

    With ``by_type=False`` the table has a single ``weight`` column filled
    with ``weights[i][0]``. With ``by_type=True`` it has one
    ``weight(<type>)`` column per entry of `types` and every value of
    ``weights[i]`` is written. Values use three decimals and fields are
    separated by ``", "``.

    `ranks`, `names` and `weights` are indexed in parallel.
    """
    with open(filename, "w") as out:
        # Header line
        if by_type:
            weight_columns = "".join(f", weight({type_})" for type_ in types)
        else:
            weight_columns = ", weight"
        out.write("rank, name" + weight_columns + "\n")

        # One line per entry
        for row, rank in enumerate(ranks):
            out.write(f"{rank}, {names[row]}")
            values = weights[row] if by_type else [weights[row][0]]
            out.write("".join(f", {value:.3f}" for value in values))
            out.write("\n")
if __name__ == "__main__":
    # Locate the data and output directories relative to this script
    script_dir = os.path.abspath(os.path.dirname(__file__))
    data_dir = os.path.join(script_dir, 'data')
    output_dir = os.path.join(script_dir, 'output')

    # NOTE: pickle can execute arbitrary code while loading; only run this
    # script on pickle files from a trusted source.
    try:
        with open(os.path.join(data_dir, 'data.pkl'), "rb") as f:
            scores, data, id_to_type, id_to_idx, type_to_idx, rank, name, filenames, couplings = pickle.load(f)
    except FileNotFoundError:
        print("No data pickle found")
        # Exit with a non-zero status so callers can detect the failure
        raise SystemExit(1)

    # Single ensemble: one set of weights shared by all types.
    # Load the cached result if possible, otherwise compute and cache it.
    analysis_path = os.path.join(data_dir, 'analysis.pkl')
    try:
        with open(analysis_path, "rb") as f:
            weights, blend_scores = pickle.load(f)
    except Exception:
        # Missing or unreadable cache: recompute. Unlike a bare except, this
        # does not swallow KeyboardInterrupt or SystemExit.
        weights, blend_scores = tf_blend(data, couplings, type_to_idx, lr=0.32, steps=200)
        with open(analysis_path, "wb") as f:
            pickle.dump((weights, blend_scores), f, -1)

    # Individual ensemble: a separate set of weights for every type
    individual_path = os.path.join(data_dir, 'analysis_individual.pkl')
    try:
        with open(individual_path, "rb") as f:
            individual_weights, individual_scores = pickle.load(f)
    except Exception:
        individual_weights, individual_scores = {}, {}
        for type_, idx in type_to_idx.items():
            # Restrict the data to this type and blend it as a single class
            d = {type_: np.arange(len(idx))}
            individual_weights[type_], individual_scores[type_] = tf_blend(
                data[:, idx], couplings[idx], d, lr=0.32, steps=200,
                do_individual_scores=False)
        with open(individual_path, "wb") as f:
            pickle.dump((individual_weights, individual_scores), f, -1)

    # Make sure the output directory exists before writing to it
    os.makedirs(output_dir, exist_ok=True)

    # tf_blend divides every per-type score by the number of types it was
    # given. The per-type runs each saw one type, so rescale their scores by
    # the total number of types to match the single ensemble.
    n_types = len(type_to_idx)

    ## Write scores of the two ensembling strategies
    write_scores(["E*", "E"] + list(rank), ["Individual Ensemble", "Single Ensemble"] + list(name),
                 [[score[0][0] / n_types for score in individual_scores.values()]] + list(blend_scores),
                 types=type_to_idx.keys(), filename=os.path.join(output_dir, 'scores.csv'))

    # Write weights of the two ensembling strategies
    write_weights(rank, name, weights, filename=os.path.join(output_dir, 'single_ensemble_weights.csv'))
    write_weights(rank, name, [[weight[i][0] for weight in individual_weights.values()] for i in range(len(rank))],
                  by_type=True, types=type_to_idx.keys(),
                  filename=os.path.join(output_dir, 'individual_ensemble_weights.csv'))