-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPCA.py
More file actions
98 lines (81 loc) · 3.61 KB
/
PCA.py
File metadata and controls
98 lines (81 loc) · 3.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
from sklearn.preprocessing import StandardScaler
# Load the measurement table and the matching label table; neither CSV has a header row.
df = pd.read_csv("/Users/arya/Downloads/cont_var_91_del.csv", header=None)
labels_df = pd.read_csv(
    "/Users/arya/Downloads/cont_var_91_del_labels1.csv", header=None
)

# Column 25 of the label table: passed to pca() as the per-row colour labels.
color = labels_df.iloc[:, 25].values
# Column 25 of the measurement table: used as the text annotation of each point.
numbers = df.iloc[:, 25].values
# Columns 0-24 of the measurement table: the feature matrix fed to the PCA.
X = df.iloc[:, :25].values
'''
PCA Calculation Functions
'''
def standardize_eigen(X):
    """Mean-centre X and eigendecompose its covariance matrix.

    Despite the name, the data are only centred here (column means removed);
    they are not scaled to unit variance.

    Args:
        X: 2-D array-like with samples in rows and features in columns.

    Returns:
        A tuple ``(eig_val, eig_vec, X_centered)`` where ``eig_val`` holds the
        eigenvalues in descending order, ``eig_vec`` holds the matching
        eigenvectors as columns, and ``X_centered`` is X minus its column means.
    """
    mean_vec = np.mean(X, axis=0)
    X_centered = X - mean_vec

    # np.cov collapses to a 0-d array when there is a single feature; the
    # decomposition below needs a square 2-D matrix.
    cov = np.atleast_2d(np.cov(X_centered.T))

    # A covariance matrix is symmetric, so use eigh: it guarantees real
    # eigenvalues and orthonormal eigenvectors, whereas the general-purpose
    # eig can return complex values caused by floating-point round-off.
    eig_val, eig_vec = np.linalg.eigh(cov)

    # eigh returns eigenvalues in ascending order; flip to descending so the
    # leading columns are the directions of greatest variance.
    return eig_val[::-1], eig_vec[:, ::-1], X_centered
def explained_variance(eig_val):
    """Return the eigenvalue sum and the cumulative share of variance.

    Args:
        eig_val: Eigenvalues, in the order the cumulative share should follow.

    Returns:
        A tuple ``(total, cumulative)``: the sum of all eigenvalues and the
        running sum of each eigenvalue divided by that total.
    """
    variance_sum = np.sum(eig_val)
    variance_share = eig_val / variance_sum
    cumulative_share = np.cumsum(variance_share)
    return variance_sum, cumulative_share
def pca_transform(X_centered, eig_vec, n_components=2):
    """Project centred data onto the first ``n_components`` eigenvectors.

    Args:
        X_centered: Centred data, samples in rows.
        eig_vec: Matrix whose columns are eigenvectors.
        n_components: How many leading columns of ``eig_vec`` to project onto.

    Returns:
        The dot product of ``X_centered`` with the selected eigenvector columns.
    """
    projection_basis = eig_vec[:, :n_components]
    return np.dot(X_centered, projection_basis)
'''
PCA Function
'''
def pca(X, components, Y, colors, specific_number=None):
    """Run PCA on X and show an annotated scatter plot of two components.

    The last two of the ``components`` retained axes are plotted against each
    other, i.e. PC1 vs PC2 when ``components`` is 2.

    Args:
        X: 2-D array with samples in rows and features in columns.
        components: Number of principal components to keep; must be >= 2.
        Y: Per-sample identifiers; ``str(Y[i])`` is drawn next to point ``i``.
        colors: Either a sequence holding one group label per sample (every
            label must be a key of the colour dictionary defined below), or a
            single matplotlib colour string applied to every point.
        specific_number: Optional iterable of identifiers taken from ``Y``.
            When given, only those points are annotated (first occurrence of
            each identifier); otherwise every point is annotated.

    Raises:
        ValueError: If ``components`` is smaller than 2, if the number of
            group labels differs from the number of samples, or if an entry
            of ``specific_number`` does not occur in ``Y``.
        KeyError: If a group label is not present in the colour dictionary.
    """
    if components < 2:
        # components - 2 would become a negative index and silently wrap
        # around to the last column, producing a mislabelled plot.
        raise ValueError("components must be at least 2 to draw a 2-D scatter plot")

    _, eig_vec, X_centered = standardize_eigen(X)
    X_pca = pca_transform(X_centered, eig_vec, n_components=components)
    x_values = X_pca[:, components - 2]
    y_values = X_pca[:, components - 1]

    # Creates the plot figure; you may have to configure these dimensions to your display screen.
    plt.figure(figsize=(15, 10))

    # Predefined mapping from group label to plotting colour.
    colordict = {"Omi+": "blue", "Omi+ ?":"brown", "ENJ":"red", "Omi- ?":"green", "Omi-":"purple"}

    # A plain string means "one colour for everything"; anything else is
    # treated as a per-sample sequence of group labels.
    has_group_labels = not isinstance(colors, str)
    if has_group_labels:
        point_colors = [colordict[label] for label in colors]
        if len(point_colors) != len(X_pca):
            raise ValueError(
                f"got {len(point_colors)} colour labels for {len(X_pca)} samples"
            )
    else:
        point_colors = colors

    # One vectorised call instead of one scatter() per point.
    plt.scatter(x_values, y_values, c=point_colors, s=20)

    # Annotate only the requested identifiers, or every point if none were requested.
    if specific_number is not None:
        identifiers = list(Y)  # built once, not once per requested identifier
        annotate_idx = [identifiers.index(wanted) for wanted in specific_number]
    else:
        annotate_idx = range(len(X_pca))
    for i in annotate_idx:
        plt.text(x_values[i], y_values[i], str(Y[i]), fontsize=8, alpha=0.6)

    plt.xlabel(f"Principal Component {components - 1}")
    plt.ylabel(f"Principal Component {components}")
    plt.title(f"PCA - Components {components - 1} vs {components}")
    plt.grid(True)
    plt.tight_layout()

    # The group legend is only meaningful when points were coloured by label.
    if has_group_labels:
        legend_handles = [
            mpatches.Patch(color=shade, label=group)
            for group, shade in colordict.items()
        ]
        plt.legend(title = "Group", handles = legend_handles)

    plt.show()
# Standardise every feature to zero mean and unit variance before the PCA.
# NOTE: this is not purely cosmetic. standardize_eigen() decomposes the
# covariance matrix, so rescaling the features changes which directions carry
# the most variance (it amounts to a PCA on the correlation matrix) and can
# therefore change how the points are arranged, not just the axis values.
X_scaled = StandardScaler().fit_transform(X)

# Plot PC1 vs PC2, coloured by label, annotating only the sample numbered 13.
pca(
    X_scaled,
    2,
    numbers,
    colors=color,
    specific_number=[13],
)