# cluster_tests.py
import numpy as np
from loss_functions import *
from manifold_functions import map_dataset_to_mfd
from scipy.optimize import minimize
import random
import sklearn.metrics
import scipy.io

def kmeans_randomly_partition_data(FQB, k):
    # Assign each point in FQB to one of k clusters uniformly at random.
    assigned_labels = []
    for _ in FQB:
        assigned_labels.append(random.choice(range(k)))
    return assigned_labels

def kmeans_cost_of_assignment(FQB, assigned_labels, mfd_dist_generic, mfd_integrand, k):
    # Total within-cluster pairwise distance, with each cluster's sum
    # normalized by its size (each pair is counted twice, hence the factor 2).
    total_cost = 0
    pts_per_clust = [0 for _ in range(k)]
    for lbl in assigned_labels:
        pts_per_clust[lbl] += 1
    for idxx, FQx in enumerate(FQB):
        for idxy, FQy in enumerate(FQB):
            if assigned_labels[idxx] == assigned_labels[idxy]:
                total_cost += mfd_dist_generic(FQx, FQy, mfd_integrand) / (2 * pts_per_clust[assigned_labels[idxx]])
    return total_cost

def kmeans_generic(FQB, k, mfd_dist_generic, mfd_integrand):
    # Coordinate-descent k-means: start from a random partition and move one
    # point at a time to the label that lowers the total assignment cost,
    # repeating until no single-point move improves it.
    assigned_labels = kmeans_randomly_partition_data(FQB, k)
    converged = False
    while not converged:
        converged = True
        for idxx, FQx in enumerate(FQB):
            curr_cost = kmeans_cost_of_assignment(FQB, assigned_labels, mfd_dist_generic, mfd_integrand, k)
            min_new_cost = curr_cost
            min_new_label_x = assigned_labels[idxx]
            for kidx in range(k):
                # Score each proposal on a copy so the current assignment is
                # not mutated while alternatives are being evaluated.
                new_labels = list(assigned_labels)
                new_labels[idxx] = kidx
                new_proposed_cost = kmeans_cost_of_assignment(FQB, new_labels, mfd_dist_generic, mfd_integrand, k)
                if new_proposed_cost < min_new_cost:
                    min_new_cost = new_proposed_cost
                    min_new_label_x = kidx
                    converged = False
            assigned_labels[idxx] = min_new_label_x
    return assigned_labels

def do_cluster_test(train_ratio, k, reg, lmbd, Bnew_euc, fxn_euc, fxn_euc_dist, Bnew_mfd, fxn_mfd, fxn_mfd_dist, fxn_integrand, true_labels, datasetname):
    npts = len(Bnew_euc)
    dim_euc = len(Bnew_euc[0])
    dim_mfd = len(Bnew_mfd[0])
    # Randomly split the data into training and testing sets.
    idx_tr = []
    idx_ts = []
    euc_data_tr = []
    euc_data_ts = []
    mfd_data_tr = []
    mfd_data_ts = []
    labels_tr = []
    labels_ts = []
    for i in range(npts):
        if np.random.random() < train_ratio:
            idx_tr.append(i)
            euc_data_tr.append(Bnew_euc[i])
            mfd_data_tr.append(Bnew_mfd[i])
            labels_tr.append(true_labels[i])
        else:
            idx_ts.append(i)
            euc_data_ts.append(Bnew_euc[i])
            mfd_data_ts.append(Bnew_mfd[i])
            labels_ts.append(true_labels[i])
    # Learn the metric Q using MMC on the training split, starting from the identity.
    Q0_euc = np.eye(dim_euc)
    Q0_mfd = np.eye(dim_mfd)
    euc_res_Powell = minimize(mmc_loss_generic, Q0_euc, args=(reg, lmbd, fxn_euc, fxn_euc_dist, None, euc_data_tr, labels_tr), method='Powell', options={'disp': True})
    mfd_res_Powell = minimize(mmc_loss_generic, Q0_mfd, args=(reg, lmbd, fxn_mfd, fxn_mfd_dist, fxn_integrand, mfd_data_tr, labels_tr), method='Powell', options={'disp': True})
    euc_Qnew = euc_res_Powell.x.reshape(dim_euc, dim_euc)
    mfd_Qnew = mfd_res_Powell.x.reshape(dim_mfd, dim_mfd)
    # Map the test data through the learned Q and through the identity baseline.
    euc_Qdata_ts = map_dataset_to_mfd(euc_data_ts, euc_Qnew, fxn_euc)
    mfd_Qdata_ts = map_dataset_to_mfd(mfd_data_ts, mfd_Qnew, fxn_mfd)
    euc_Idata_ts = map_dataset_to_mfd(euc_data_ts, Q0_euc, fxn_euc)
    mfd_Idata_ts = map_dataset_to_mfd(mfd_data_ts, Q0_mfd, fxn_mfd)
    scipy.io.savemat('./Q' + datasetname + '.mat', mdict={'Q': mfd_Qnew, 'data': mfd_Idata_ts})
    # Run k-means; K is the number of unique ground-truth labels.
    K = len(np.unique(true_labels))
    euc_lab_ts = kmeans_generic(euc_Idata_ts, K, fxn_euc_dist, None)
    euc_Qlab_ts = kmeans_generic(euc_Qdata_ts, K, fxn_euc_dist, None)
    mfd_lab_ts = kmeans_generic(mfd_Idata_ts, K, fxn_mfd_dist, fxn_integrand)
    mfd_Qlab_ts = kmeans_generic(mfd_Qdata_ts, K, fxn_mfd_dist, fxn_integrand)
    # Evaluate the k-means results against the held-out true labels.
    err_euc_orig = eval_cluster_quality(labels_ts, euc_lab_ts)
    err_euc_qlrn = eval_cluster_quality(labels_ts, euc_Qlab_ts)
    err_mfd_orig = eval_cluster_quality(labels_ts, mfd_lab_ts)
    err_mfd_qlrn = eval_cluster_quality(labels_ts, mfd_Qlab_ts)
    return err_euc_orig, err_euc_qlrn, err_mfd_orig, err_mfd_qlrn

def eval_cluster_quality(true_labels, assigned_labels):
    # The true labels may arrive as length-1 sequences (e.g. rows of a .mat
    # column vector); unwrap them to scalars before scoring.
    true_labels = [i[0] for i in true_labels]
    ARI = sklearn.metrics.adjusted_rand_score(true_labels, assigned_labels)
    NMI = sklearn.metrics.normalized_mutual_info_score(true_labels, assigned_labels)
    return [ARI, NMI]

def do_cluster_tests_all(nrounds, train_ratio, k, reg, lmbd, Bnew_euc, fxn_euc, fxn_euc_dist, Bnew_mfd, fxn_mfd, fxn_mfd_dist, fxn_integrand, true_labels, datasetname):
    # Repeat the train/test clustering experiment nrounds times and collect the scores.
    err_euc_orig = []
    err_euc_qlrn = []
    err_mfd_orig = []
    err_mfd_qlrn = []
    for _ in range(nrounds):
        eeo, eeq, emo, emq = do_cluster_test(train_ratio, k, reg, lmbd, Bnew_euc, fxn_euc, fxn_euc_dist, Bnew_mfd, fxn_mfd, fxn_mfd_dist, fxn_integrand, true_labels, datasetname)
        err_euc_orig.append(eeo)
        err_euc_qlrn.append(eeq)
        err_mfd_orig.append(emo)
        err_mfd_qlrn.append(emq)
    return err_euc_orig, err_euc_qlrn, err_mfd_orig, err_mfd_qlrn
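

if __name__ == "__main__":
    # Minimal usage sketch (not part of the original experiments): run the
    # coordinate-descent k-means above on synthetic 2-D data with a plain
    # Euclidean distance. The distance function below is a hypothetical
    # stand-in for the distances defined in loss_functions / manifold_functions;
    # it only needs to match the (x, y, integrand) call signature used above.
    def _euclidean_dist(x, y, _integrand=None):
        return float(np.linalg.norm(np.asarray(x) - np.asarray(y)))

    rng = np.random.default_rng(0)
    # Two well-separated Gaussian blobs of 20 points each.
    demo_data = np.vstack([rng.normal(0.0, 0.3, size=(20, 2)),
                           rng.normal(3.0, 0.3, size=(20, 2))])
    demo_labels = kmeans_generic(list(demo_data), 2, _euclidean_dist, None)
    print("cluster sizes:", [demo_labels.count(c) for c in range(2)])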