train.py
import numpy as np
np.random.seed(1234)
import pandas as pd
import uproot
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
# TODO: Add more variables
# Specify variables which shall be used as input for the neural network
variables = ["Tau_pt"]
if __name__ == "__main__":
    # Read NanoAOD files
    def load_tree(filename):
        # Note: tree.array(name) is the uproot 3 API; uproot 4 and later
        # use tree[name].array() instead.
        tree = uproot.open(filename)["Events"]
        data = {}
        for name in variables:
            # Only use the tau lepton leading in pT. Set a default (here -10)
            # for events which have no tau lepton at all.
            if "Tau_" in name:
                data[name] = [x[0] if len(x) > 0 else -10 for x in tree.array(name).tolist()]
            else:
                data[name] = tree.array(name)
        return pd.DataFrame(data)
    signal = load_tree("htt_nano.root")
    background = load_tree("qcd_nano.root")

    # Skim dataset
    def skim(df):
        # Remove events which have no tau lepton at all, identified by the
        # default value of -10 that we set for Tau_pt in these cases.
        return df[df["Tau_pt"] > 0]

    signal = skim(signal)
    background = skim(background)

    # Compute training weights
    sum_entries = len(signal) + len(background)

    def reweight(df):
        # Reweight the samples to equal importance - this tells the neural
        # network that we are equally interested in finding fake and genuine
        # tau leptons. Is this the optimal setting?
        df["training_weight"] = sum_entries / float(len(df))

    reweight(signal)
    reweight(background)
    # Load the data in a format understood by Keras
    def to_numpy(df, class_label):
        # Note: df.as_matrix() was removed in pandas 1.0; .values works in
        # both old and new pandas versions.
        x = df[variables].values
        y = np.ones(len(df)) * class_label
        w = df["training_weight"].values
        return x, y, w

    # The input variables are stored in an array "x" and the desired outputs
    # in an array "y": "0" for fake taus and "1" for genuine taus.
    x_sig, y_sig, w_sig = to_numpy(signal, class_label=1)
    x_bkg, y_bkg, w_bkg = to_numpy(background, class_label=0)

    # Stack the numpy arrays of the different classes into single arrays
    x = np.vstack([x_sig, x_bkg])
    y = np.hstack([y_sig, y_bkg])
    w = np.hstack([w_sig, w_bkg])

    print("Input variables:", x)
    print("True output class:", y)
    print("Training weights:", w)
    # Fit the preprocessing on the inputs and transform them.
    # This standardizes each input variable to zero mean and unit variance.
    scaler = StandardScaler().fit(x)
    x = scaler.transform(x)

    # TODO: Create a new Sequential neural network called "model" (see the
    # tutorial slides or https://keras.io/)
    # TODO: Add one dense hidden layer (around 100 neurons) and one dense
    # output layer (1 neuron). Use a "relu" activation function for the
    # hidden layer and a "sigmoid" activation function for the output layer.
    # model =
    # TODO: Specify the loss function and the optimizer algorithm
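    # A minimal sketch of one possible solution; the layer size and the
    # choice of "adam" with binary cross-entropy are illustrative, not
    # prescribed by the exercise:
    from keras.models import Sequential
    from keras.layers import Dense

    model = Sequential()
    model.add(Dense(100, activation="relu", input_dim=len(variables)))
    model.add(Dense(1, activation="sigmoid"))
    model.compile(loss="binary_crossentropy", optimizer="adam")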
    # Print the architecture
    model.summary()
    # Split the dataset into a training part used for the gradient steps and
    # a validation part used for monitoring the training
    x_train, x_val, y_train, y_val, w_train, w_val = train_test_split(
        x, y, w, test_size=0.5, random_state=1234)
    # TODO: Declare callbacks to be used during training
    # (keras.callbacks.ModelCheckpoint and keras.callbacks.EarlyStopping).
    # Early stopping stops the training if no improvement was achieved in the
    # last N epochs (N is set by the "patience" parameter).
    # TODO: Train the neural network
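    # A minimal sketch; the checkpoint filename, patience, epoch count, and
    # batch size below are illustrative choices:
    from keras.callbacks import ModelCheckpoint, EarlyStopping

    callbacks = [
        ModelCheckpoint("model.h5", save_best_only=True, verbose=1),
        EarlyStopping(patience=10),
    ]
    history = model.fit(
        x_train, y_train,
        sample_weight=w_train,
        validation_data=(x_val, y_val, w_val),
        batch_size=100,
        epochs=100,
        callbacks=callbacks)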
    # Plot the loss on the training and validation datasets
    plt.figure(figsize=(6, 6))
    epochs = range(1, len(history.history["loss"]) + 1)
    plt.plot(epochs, history.history["loss"], "o-", label="Training")
    plt.plot(epochs, history.history["val_loss"], "o-", label="Validation")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.yscale("log")
    plt.legend()
    plt.tight_layout()
    plt.savefig("loss.png")
    # TODO: Make a prediction on the validation dataset and compute the ROC
    # curve and the AUC
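    # A minimal sketch using the roc_curve and auc imports from the top of
    # the file; the plot filename is an illustrative choice:
    y_pred = model.predict(x_val).squeeze()
    fpr, tpr, _ = roc_curve(y_val, y_pred, sample_weight=w_val)
    print("AUC:", auc(fpr, tpr))

    plt.figure(figsize=(6, 6))
    plt.plot(fpr, tpr)
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.tight_layout()
    plt.savefig("roc.png")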
    # TODO: Get the NN scores for true signal and true background events
    # TODO: Plot the NN output
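    # A minimal sketch, reusing y_pred from the prediction step above: split
    # the validation scores by true class and histogram them. Binning and
    # filename are illustrative choices.
    scores_sig = y_pred[y_val == 1]
    scores_bkg = y_pred[y_val == 0]

    plt.figure(figsize=(6, 6))
    plt.hist(scores_sig, bins=30, range=(0, 1), histtype="step", label="Genuine taus")
    plt.hist(scores_bkg, bins=30, range=(0, 1), histtype="step", label="Fake taus")
    plt.xlabel("NN output")
    plt.ylabel("Events")
    plt.legend()
    plt.tight_layout()
    plt.savefig("nn_output.png")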
    # Serialize the inputs for the analysis of the gradients
    pickle.dump(x, open("x.pickle", "wb"))