-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCNN Training Code.py
104 lines (78 loc) · 4.18 KB
/
CNN Training Code.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# NOTE(review): no import statements are visible in this part of the file; the
# code below expects `pd`, `np` and `OneHotEncoder` to already be in scope.

# Read in the training data.
dataframe = pd.read_csv('labels_new.csv')

# Image size and the number of classes being classified.
NUM_CLASSES = 2
IMG_WIDTH = 90
IMG_HEIGHT = 335

# selected_docs holds the NUM_CLASSES values of the 'aligned' column that have
# the most 'id' entries (in this case both classes).
selected_docs = list(
    dataframe.groupby('aligned')
    .count()
    .sort_values(by='id', ascending=False)
    .head(NUM_CLASSES)
    .index
)

# df_sub_train keeps only the records whose 'aligned' value is in selected_docs.
df_sub_train = dataframe[dataframe['aligned'].isin(selected_docs)]

# targets is the class label of every kept record, as a pandas Series
# (an array with axis labels).
targets = pd.Series(df_sub_train['aligned'])

# one_hot is a dataframe with one indicator column per class value.
# The frame is built dense because it is converted to a plain array on the
# next step anyway, and the dtype is pinned because the default indicator
# dtype of get_dummies differs between pandas versions (uint8 vs. bool).
one_hot = pd.get_dummies(targets, dtype=np.float32)

# one_hot_labels is the 2D array form of one_hot: one row per record, one
# column per class.
one_hot_labels = np.asarray(one_hot)

# One-hot encode the selected class values themselves.
# The encoder is left at its default (sparse) output and densified with
# .toarray(): the `sparse=False` keyword is not accepted by recent
# scikit-learn releases, while this form works on old and new versions alike.
# NOTE(review): onehot_encoder / onehot_encoded are not read again in the
# visible code.
data = np.array(selected_docs)
onehot_encoder = OneHotEncoder()
onehot_encoded = onehot_encoder.fit_transform(data.reshape(-1, 1)).toarray()
# Function to read an image and resize it accordingly
def read_img(img_id):
    """Read the image at path *img_id* and resize it to the network input size.

    Args:
        img_id: Path of the image file, passed straight to ``cv2.imread``.

    Returns:
        The image resized with ``cv2.resize`` to ``(IMG_WIDTH, IMG_HEIGHT)``
        (``cv2.resize`` takes the target size as width, height).

    Raises:
        OSError: If ``cv2.imread`` could not load the file.
    """
    img = cv2.imread(img_id)
    # cv2.imread signals failure by returning None instead of raising; fail
    # here with the offending path rather than with an opaque error from
    # cv2.resize.
    if img is None:
        raise OSError("Could not read image: {!r}".format(img_id))
    return cv2.resize(img, (IMG_WIDTH, IMG_HEIGHT))
# Set up the image, image-id and class-label lists.
images = []
image_ids = []
classes = []

# Load every image together with its id and class label and append each to
# the matching list. The two columns are selected by name so the loop does
# not depend on the CSV having exactly these two columns in this order.
# NOTE(review): in the visible code only image_ids is read again; `images`
# and `classes` are not, and the images are decoded a second time after the
# train/test split. Consider dropping the read here if nothing else needs it.
for img_id, aligned in tqdm.tqdm(df_sub_train[['id', 'aligned']].values):
    images.append(read_img(img_id))
    image_ids.append(img_id)
    classes.append(aligned)
# Train/test split.
# The image ids are used as X and the one-hot labels as Y, so the split is
# done on ids and the pixel data is loaded per split afterwards.
X = np.array(image_ids)
Y = np.array(one_hot_labels)

# Hold out 15% of the (shuffled) records as the test set.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.15, shuffle=True)

# Load the image behind every id of each split and stack them into arrays.
x_train_images = np.array([read_img(img_path) for img_path in tqdm.tqdm(x_train)])
x_test_images = np.array([read_img(img_path) for img_path in tqdm.tqdm(x_test)])
# Get the Xception model without its fully connected top and without any
# pre-trained weights. `classes` is not passed: Keras only uses it when
# include_top=True, so it had no effect here.
bottleneck = Xception(include_top=False,
                      weights=None,
                      input_shape=(IMG_HEIGHT, IMG_WIDTH, 3))

# Add the fully connected head that maps the convolutional features to the
# number of classes.
model = bottleneck.output
model = Flatten()(model)
model = Dense(256, activation='sigmoid')(model)
model = Dropout(0.5)(model)
model = Dense(64, activation='sigmoid')(model)
# The labels are one-hot (exactly one class per image) and the loss used for
# training is categorical cross-entropy, so the output layer uses softmax to
# produce a probability distribution over the classes rather than
# independent per-class sigmoids.
predictions = Dense(NUM_CLASSES, activation='softmax')(model)
final_model = Model(inputs=[bottleneck.input], outputs=[predictions])
# Compile the model and train it for a fixed number of epochs, passing
# validation data so over-fitting can be spotted per epoch.
# `learning_rate` is the current name of the SGD step-size argument; `lr` is a
# deprecated alias that newer Keras releases reject.
final_model.compile(SGD(learning_rate=0.1, momentum=0.9),
                    loss='categorical_crossentropy',
                    metrics=['categorical_accuracy'])
# NOTE(review): the same test split is used as validation data here and for
# the final evaluation below, so the reported score is not an independent
# hold-out estimate.
history = final_model.fit(x_train_images, y_train,
                          epochs=10,
                          batch_size=64,
                          validation_data=(x_test_images, y_test))

# Output the accuracy on the test split.
scores = final_model.evaluate(x_test_images, y_test, verbose=0)
print("%s: %.2f%%" % (final_model.metrics_names[1], scores[1]*100))

# Save the trained CNN.
final_model.save("Xception_history_10_epochs.h5")

# Write the per-epoch training history (losses and metrics for the training
# and validation data) to CSV.
pd.DataFrame(history.history).to_csv("history_Xception.csv")