# noisy_data.py
# Run this file to apply the decision tree and k-NN algorithms to the data sets (Figures 1 and 2) after they have
# been injected with noisy data (set to 20% of the original data) to discover whether overfitting happens.
# (Overfitting is the phenomenon where the model is trained to a very small (near 0%) training error while the
# test error actually increases.) The validation set (which also contains noise) is used to choose the best
# hyper-parameters (such as pruning levels for decision trees and k for k-NN), and the test accuracy of the tuned
# models is reported as the most reliable estimate of their predictive accuracy on future unseen test data.
import matplotlib.pyplot as plt

from data_creation import figure_one, figure_two
from helpers import noise
from supervised_learning_algorithms import k_NN_algorithm, decision_tree
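

# Illustrative sketch (not called by this script) of the tuning idea described in the header comment:
# choose k for k-NN by 10-fold cross-validation on the noisy training data, then report the accuracy on
# held-out test data. This assumes scikit-learn is available and that the points have already been
# converted to a feature matrix X and a label vector y; the project's own k_NN_algorithm.knn_algorithm
# and decision_tree.decision_tree perform the actual tuning used below.
def _cross_validation_tuning_sketch(X, y):
    from sklearn.model_selection import GridSearchCV, train_test_split
    from sklearn.neighbors import KNeighborsClassifier

    # Hold out unseen test data, then pick k by 10-fold cross-validation on the remaining data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    search = GridSearchCV(KNeighborsClassifier(), {"n_neighbors": list(range(1, 26))}, cv=10)
    search.fit(X_train, y_train)
    # The accuracy of the tuned model on the held-out test data is the most reliable estimate
    # of its predictive accuracy on future unseen data
    return search.best_params_["n_neighbors"], search.best_estimator_.score(X_test, y_test)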


# Helper function to plot the k values against their accuracies for each figure
def plot_k_subplots(cv_k_data: list, test_k_data: list):
    k_fig, axs = plt.subplots(2, 3, figsize=(20, 7))
    # Make the window fullscreen (relies on a Tk-based matplotlib backend such as TkAgg)
    k_manager = plt.get_current_fig_manager()
    k_manager.resize(*k_manager.window.maxsize())
    # Set the title of the entire plot window
    k_fig.suptitle(
        "Blue Line = Test Accuracy on Cross-Validation with 10 folds, Red = Test Accuracy on Unseen Test Data",
        fontsize=24)
    # Variables for controlling subplot positioning
    column = 0
    row = 0
    # Plot the cross-validation and test accuracies for each figure in its own subplot
    for index, k_figure in enumerate(cv_k_data):
        axs[row, column].plot(range(len(k_figure)), k_figure, 'tab:blue')
        axs[row, column].plot(range(len(test_k_data[index])), test_k_data[index], 'tab:red')
        axs[row, column].set_title("k Accuracy - Figure " + str(index + 1))
        # Move to the next column; if at the end of the row, go one row down
        if column < 2:
            column += 1
        else:
            column = 0
            row += 1
    plt.show()
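
# Hypothetical usage example for plot_k_subplots (illustrative accuracy values only; in this script the
# real inputs come from k_NN_algorithm.knn_algorithm in the main block below):
def _plot_k_subplots_demo():
    plot_k_subplots(cv_k_data=[[0.71, 0.78, 0.80], [0.69, 0.74, 0.73]],
                    test_k_data=[[0.68, 0.75, 0.77], [0.66, 0.72, 0.70]])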


# Adds noise points to the data (the requested percentage is split evenly across the three colours)
def noise_creation(data_list: list, percentage_of_noise: float):
    number_of_points = int((len(data_list) * percentage_of_noise) / 3)
    noise_data = []
    # Purple dot noise
    x_list, y_list = noise.generate(number_of_points, -20, 20, -60, 60)
    # Add the points with their colour label to the list of noise points
    for element in plt.scatter(x_list, y_list, c="purple", s=20).get_offsets():
        noise_data.append([element.tolist(), "purple"])
    # Yellow dot noise
    x_list, y_list = noise.generate(number_of_points, -20, 20, -60, 60)
    # Add the points with their colour label to the list of noise points
    for element in plt.scatter(x_list, y_list, c="yellow", s=20).get_offsets():
        noise_data.append([element.tolist(), "yellow"])
    # Orange dot noise
    x_list, y_list = noise.generate(number_of_points, -20, 20, -60, 60)
    # Add the points with their colour label to the list of noise points
    for element in plt.scatter(x_list, y_list, c="orange", s=20).get_offsets():
        noise_data.append([element.tolist(), "orange"])
    return data_list + noise_data
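
# Illustrative sketch (not called anywhere): noise_creation above routes the generated points through
# plt.scatter only to read the coordinates back via get_offsets(); conceptually it builds entries of the
# form [[x, y], colour]. Assuming noise.generate returns two equal-length coordinate lists, the same
# entries could be produced directly like this:
def _noise_points_sketch(number_of_points: int) -> list:
    noise_points = []
    for colour in ("purple", "yellow", "orange"):
        # Generate number_of_points uniformly random points in the same x/y ranges used above
        x_list, y_list = noise.generate(number_of_points, -20, 20, -60, 60)
        noise_points.extend([[x, y], colour] for x, y in zip(x_list, y_list))
    return noise_points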


if __name__ == '__main__':
    # Program start message
    print("Program started.\n")
    # Stores a list of all the cross-validation accuracies for each figure for k-NN
    all_cv_knn_accuracies = []
    # Stores a list of all the test accuracies for each figure for k-NN
    all_test_knn_accuracies = []
    # Stores a list of the chosen tuned hyperparameters for k-NN
    tuned_k_values = []
    # Stores the test data accuracy of each of the tuned hyperparameters in k-NN
    knn_test_accuracies = []
    # Stores the test data accuracy of each decision tree
    all_dtree_test_accuracies = []
    # Stores a list of the chosen tuned hyperparameters for the decision trees
    tuned_dtree_hyperparameters = []

    fig = plt.figure(figsize=(15, 10))
    # Make the window fullscreen (relies on a Tk-based matplotlib backend such as TkAgg)
    manager = plt.get_current_fig_manager()
    manager.resize(*manager.window.maxsize())
    # Set the title of the entire plot window
    fig.suptitle("The Data Generated and Plotted for Figures 1 and 2", fontsize=24)

    # The percentage of noise to be added
    percentage_of_noise = 0.20
    print("The current percentage of noise is set to: " + str(percentage_of_noise * 100) + "%")
    # Generating data message
    print("Generating data and adding noise...", end="")
    # Add the data of each figure to a list
    data = [noise_creation(figure_one.create_data(fig), percentage_of_noise),
            noise_creation(figure_two.create_data(fig), percentage_of_noise)]
    print("done\n")
    # Loop through each figure to apply the k-NN algorithm and decision tree algorithm
    for figure_number, figure in enumerate(data):
        # Running k-NN algorithm message
        print("Running the k-NN algorithm on Figure " + str(figure_number + 1) + "...", end="")
        cv_k_a, td_k_a, tuned_k, test_acc = k_NN_algorithm.knn_algorithm(figure)
        print("done\n")
        all_cv_knn_accuracies.append(cv_k_a)
        all_test_knn_accuracies.append(td_k_a)
        tuned_k_values.append(tuned_k)
        knn_test_accuracies.append(test_acc)
        print("Running the decision tree algorithm on Figure " + str(figure_number + 1) + "...", end="")
        dtree_accuracy, dtree_hyperparameters = decision_tree.decision_tree(figure)
        all_dtree_test_accuracies.append(dtree_accuracy)
        tuned_dtree_hyperparameters.append(dtree_hyperparameters)
        print("done\n")

    # Showing data message
    print("Plotting and showing generated data...", end="")
    plt.show(block=False)
    print("done\n")
    # Report the tuned hyperparameters and test accuracies for each figure
    for i in range(2):
        print("Figure " + str(i + 1) + ":")
        print("\tTuned hyperparameters for the Figure " + str(i + 1) + " k-NN algorithm are:")
        print("\t\tk=" + str(tuned_k_values[i]))
        print("\t\t" + str(knn_test_accuracies[i])
              + " -> k-NN test accuracy on future unseen test data with tuned hyperparameters")
        print("\tTuned hyperparameters for the Figure " + str(i + 1) + " Decision Tree algorithm are:")
        print("\t\t" + str(tuned_dtree_hyperparameters[i]))
        print("\t\t" + str(all_dtree_test_accuracies[i])
              + " -> Decision Tree test accuracy on future unseen test data with tuned hyperparameters")
        if all_dtree_test_accuracies[i] > knn_test_accuracies[i]:
            print("\n\tFor Figure " + str(i + 1) + " the Decision Tree model performs better.\n")
        elif all_dtree_test_accuracies[i] < knn_test_accuracies[i]:
            print("\n\tFor Figure " + str(i + 1) + " the k-NN model performs better.\n")
        else:
            print("\n\tFor Figure " + str(i + 1) + " both models perform the same.\n")

    # Showing k findings message
    print("\nPlotting and showing k value data...", end="")
    plot_k_subplots(all_cv_knn_accuracies, all_test_knn_accuracies)
    print("done\n")