-
Notifications
You must be signed in to change notification settings - Fork 0
/
2019.04.19_distances_for_tsne.R
277 lines (202 loc) · 11.6 KB
/
2019.04.19_distances_for_tsne.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
# Tf-Idf+Scale(columns) + tSNE(Euclid+PCA)
#
# Concatenates each author's posts from `t[[13]]` into one document, builds a
# Porter2-stemmed tf-idf document-term matrix with textTinyR, removes sparse
# terms and duplicated rows, column-scales what is left and runs Rtsne on it
# (`pca` / `is_distance` are not passed, so Rtsne's defaults apply).
# The 2D embedding is saved as "tsne_v1.png" in the working directory.
#
# Args:
#   t: indexable collection whose 13th element is a table with `Author` and
#      `Content` columns.
# Returns: whatever dev.off() returns once the PNG device is closed.
v1 <- function(t) {
  # One row per author, authors with the most posts first.
  per_author <- arrange(
    summarise(
      group_by(t[[13]], Author),
      nr_of_posts = n(),
      text = paste0(Content, collapse = " ")
    ),
    desc(nr_of_posts)
  )

  builder <- textTinyR::sparse_term_matrix$new(
    vector_data = per_author$text,
    file_data = NULL,
    document_term_matrix = TRUE
  )
  # The matrix returned here is not used directly; the sparsity-adjusted
  # matrix is requested from the same object immediately afterwards.
  builder$Term_Matrix(
    sort_terms = TRUE,
    to_lower = TRUE,
    remove_punctuation_vector = FALSE,
    remove_numbers = TRUE,
    trim_token = TRUE,
    split_string = TRUE,
    stemmer = "porter2_stemmer",
    split_separator = " \r\n\t.,;:()?!//",
    remove_stopwords = TRUE,
    language = "english",
    min_num_char = 3,
    max_num_char = 100,
    print_every_rows = 100000,
    normalize = NULL,
    tf_idf = TRUE,
    threads = 6,
    verbose = TRUE
  )

  dense_terms <- as.matrix(
    builder$Term_Matrix_Adjust(sparsity_thresh = 0.98498730956292)
  )
  # Identical rows are collapsed before the matrix is handed to Rtsne.
  doc_terms <- unique(dense_terms)

  embedding <- Rtsne(
    scale(doc_terms),
    dims = 2,
    perplexity = 30,
    verbose = TRUE,
    max_iter = 2000
  )
  coords <- embedding$Y

  png("tsne_v1.png", width = 1800, height = 1800, res = 300)
  plot(
    coords[, 1],
    coords[, 2],
    main = "Tf-Idf+Scale(columns) + tSNE(Euclid+PCA)",
    xlab = "Dim1",
    ylab = "Dim2",
    col = adjustcolor(1, alpha.f = 0.5),
    pch = 16
  )
  dev.off()
}
# v2: Tf + Scale + tSNE(Euclid+PCA)
# v2_2: Tf + tSNE(Euclid+PCA)
#
# Same per-author pipeline as the tf-idf variant, but with plain term
# frequencies (tf_idf = FALSE). Rtsne is run twice, first on the raw matrix
# and then on the column-scaled matrix, and both embeddings are drawn side by
# side into "tsne_v2.png" (raw on the left, scaled on the right).
#
# Args:
#   t: indexable collection whose 13th element is a table with `Author` and
#      `Content` columns.
# Returns: whatever dev.off() returns once the PNG device is closed.
v2 <- function(t) {
  # One row per author, authors with the most posts first.
  per_author <- arrange(
    summarise(
      group_by(t[[13]], Author),
      nr_of_posts = n(),
      text = paste0(Content, collapse = " ")
    ),
    desc(nr_of_posts)
  )

  builder <- textTinyR::sparse_term_matrix$new(
    vector_data = per_author$text,
    file_data = NULL,
    document_term_matrix = TRUE
  )
  # The matrix returned here is not used directly; the sparsity-adjusted
  # matrix is requested from the same object immediately afterwards.
  builder$Term_Matrix(
    sort_terms = TRUE,
    to_lower = TRUE,
    remove_punctuation_vector = FALSE,
    remove_numbers = TRUE,
    trim_token = TRUE,
    split_string = TRUE,
    stemmer = "porter2_stemmer",
    split_separator = " \r\n\t.,;:()?!//",
    remove_stopwords = TRUE,
    language = "english",
    min_num_char = 3,
    max_num_char = 100,
    print_every_rows = 100000,
    normalize = NULL,
    tf_idf = FALSE,
    threads = 6,
    verbose = TRUE
  )

  dense_terms <- as.matrix(
    builder$Term_Matrix_Adjust(sparsity_thresh = 0.98498730956292)
  )
  doc_terms <- unique(dense_terms)

  # Shared Rtsne settings; the raw run happens before the scaled run.
  run_tsne <- function(m) {
    Rtsne(m, dims = 2, perplexity = 30, verbose = TRUE, max_iter = 2000)
  }
  coords_raw <- run_tsne(doc_terms)$Y
  coords_scaled <- run_tsne(scale(doc_terms))$Y

  png("tsne_v2.png", width = 3200, height = 1600, res = 300)
  par(mfrow = c(1, 2))
  plot(
    coords_raw[, 1],
    coords_raw[, 2],
    main = "Tf + tSNE(Euclid+PCA)",
    xlab = "Dim1",
    ylab = "Dim2",
    col = adjustcolor(1, alpha.f = 0.5),
    pch = 16
  )
  plot(
    coords_scaled[, 1],
    coords_scaled[, 2],
    main = "Tf + Scale + tSNE(Euclid+PCA)",
    xlab = "Dim1",
    ylab = "Dim2",
    col = adjustcolor(1, alpha.f = 0.5),
    pch = 16
  )
  dev.off()
}
# Tf + Euclidean + PCA + tSNE (raw and column-scaled input, side by side).
#
# Builds a per-author term-frequency matrix (tf_idf = FALSE), removes sparse
# terms and duplicated rows, computes Euclidean distances, reduces them to 50
# coordinates with cmdscale() and feeds those coordinates to Rtsne with its
# own PCA step switched off. This is done for the scaled and the raw matrix;
# both embeddings go into "tsne_v3.png" (raw on the left, scaled on the right).
#
# Args:
#   t: indexable collection whose 13th element is a table with `Author` and
#      `Content` columns.
# Returns: whatever dev.off() returns once the PNG device is closed.
v3 <- function(t) {
  # One row per author, authors with the most posts first.
  per_author <- arrange(
    summarise(
      group_by(t[[13]], Author),
      nr_of_posts = n(),
      text = paste0(Content, collapse = " ")
    ),
    desc(nr_of_posts)
  )

  builder <- textTinyR::sparse_term_matrix$new(
    vector_data = per_author$text,
    file_data = NULL,
    document_term_matrix = TRUE
  )
  # The matrix returned here is not used directly; the sparsity-adjusted
  # matrix is requested from the same object immediately afterwards.
  builder$Term_Matrix(
    sort_terms = TRUE,
    to_lower = TRUE,
    remove_punctuation_vector = FALSE,
    remove_numbers = TRUE,
    trim_token = TRUE,
    split_string = TRUE,
    stemmer = "porter2_stemmer",
    split_separator = " \r\n\t.,;:()?!//",
    remove_stopwords = TRUE,
    language = "english",
    min_num_char = 3,
    max_num_char = 100,
    print_every_rows = 100000,
    normalize = NULL,
    tf_idf = FALSE,
    threads = 6,
    verbose = TRUE
  )

  dense_terms <- as.matrix(
    builder$Term_Matrix_Adjust(sparsity_thresh = 0.98498730956292)
  )
  doc_terms <- unique(dense_terms)

  # Euclidean distances -> 50 cmdscale coordinates -> 2D t-SNE.
  embed_euclidean <- function(m) {
    reduced <- cmdscale(dist(m, method = "euclidean"), k = 50)
    Rtsne(
      reduced,
      dims = 2,
      perplexity = 30,
      verbose = TRUE,
      max_iter = 2000,
      is_distance = FALSE,
      pca = FALSE
    )
  }
  # The scaled embedding is computed before the raw one.
  coords_scaled <- embed_euclidean(scale(doc_terms))$Y
  coords_raw <- embed_euclidean(doc_terms)$Y

  png("tsne_v3.png", width = 3200, height = 1600, res = 300)
  par(mfrow = c(1, 2))
  plot(
    coords_raw[, 1],
    coords_raw[, 2],
    main = "Tf + Euclidean + PCA + tSNE",
    xlab = "Dim1",
    ylab = "Dim2",
    col = adjustcolor(1, alpha.f = 0.5),
    pch = 16
  )
  plot(
    coords_scaled[, 1],
    coords_scaled[, 2],
    main = "Tf + Scale + Euclidean + PCA + tSNE",
    xlab = "Dim1",
    ylab = "Dim2",
    col = adjustcolor(1, alpha.f = 0.5),
    pch = 16
  )
  dev.off()
}
# Tf + Cosine + PCA + tSNE (raw and normalised input, side by side).
#
# Builds a per-author term-frequency matrix (tf_idf = FALSE), removes sparse
# terms, duplicated rows and all-zero rows, converts cosine similarities
# between documents into distances, reduces them to 50 coordinates with
# cmdscale() and feeds those coordinates to Rtsne with its own PCA step
# switched off. Both embeddings are written to "tsne_v4.png" (raw on the left,
# normalize_input()-ed on the right).
#
# Args:
#   t: indexable collection whose 13th element is a table with `Author` and
#      `Content` columns. Inside this function `t(x)` still calls the
#      transpose function, because R skips non-function bindings when
#      resolving a call.
# Returns: whatever dev.off() returns once the PNG device is closed.
v4 <- function(t) {
  per_author <- t[[13]] %>%
    group_by(Author) %>%
    summarise(nr_of_posts = n(), text = paste0(Content, collapse = " ")) %>%
    arrange(desc(nr_of_posts))

  init <- textTinyR::sparse_term_matrix$new(
    vector_data = per_author$text,
    file_data = NULL,
    document_term_matrix = TRUE
  )
  # The matrix returned here is not used directly; the sparsity-adjusted
  # matrix is requested from the same object immediately afterwards.
  init$Term_Matrix(
    sort_terms = TRUE, to_lower = TRUE, remove_punctuation_vector = FALSE,
    remove_numbers = TRUE, trim_token = TRUE, split_string = TRUE,
    stemmer = "porter2_stemmer",
    split_separator = " \r\n\t.,;:()?!//", remove_stopwords = TRUE,
    language = "english", min_num_char = 3, max_num_char = 100,
    print_every_rows = 100000, normalize = NULL, tf_idf = FALSE,
    threads = 6, verbose = TRUE
  )

  doc_terms <- unique(as.matrix(init$Term_Matrix_Adjust(sparsity_thresh = 0.91)))
  # Rows without any remaining term are dropped before the cosine step;
  # drop = FALSE keeps a matrix even if only one row survives.
  doc_terms <- doc_terms[rowSums(doc_terms) > 0, , drop = FALSE]

  # cmdscale() expects dissimilarities, so similarities are converted with
  # 1 - similarity, matching the script-level code in this file.
  # Assumes cosine() returns a column-wise cosine *similarity* matrix
  # (e.g. lsa::cosine) -- TODO confirm which package provides it.
  dist_scaled <- 1 - cosine(t(normalize_input(doc_terms)))
  dist_raw <- 1 - cosine(t(doc_terms))

  pc_scaled <- cmdscale(dist_scaled, k = 50)
  pc_raw <- cmdscale(dist_raw, k = 50)

  tsne_scaled <- Rtsne(pc_scaled, dims = 2, perplexity = 30, verbose = TRUE,
                       max_iter = 2000, is_distance = FALSE, pca = FALSE)
  tsne_raw <- Rtsne(pc_raw, dims = 2, perplexity = 30, verbose = TRUE,
                    max_iter = 2000, is_distance = FALSE, pca = FALSE)

  png("tsne_v4.png", width = 3200, height = 1600, res = 300)
  # Close the PNG device even when plotting fails, so it does not stay open.
  device_open <- TRUE
  on.exit(if (device_open) dev.off(), add = TRUE)

  par(mfrow = c(1, 2))
  plot(tsne_raw$Y[, 1], tsne_raw$Y[, 2],
       main = "Tf + Cosine + PCA + tSNE",
       xlab = "Dim1", ylab = "Dim2",
       col = adjustcolor(1, alpha.f = 0.5), pch = 16)
  plot(tsne_scaled$Y[, 1], tsne_scaled$Y[, 2],
       main = "Tf + Scale + Cosine + PCA + tSNE",
       xlab = "Dim1", ylab = "Dim2",
       col = adjustcolor(1, alpha.f = 0.5), pch = 16)

  device_open <- FALSE
  dev.off()
}
# Tf-Idf + Cosine + PCA + tSNE (raw and column-scaled input, side by side).
#
# Builds a per-author tf-idf matrix (tf_idf = TRUE), removes sparse terms,
# duplicated rows and all-zero rows, converts cosine similarities between
# documents into distances, reduces them to 50 coordinates with cmdscale()
# and feeds those coordinates to Rtsne with its own PCA step switched off.
# Both embeddings are written to "tsne_v5.png" (raw on the left, scale()-d on
# the right).
#
# Args:
#   t: indexable collection whose 13th element is a table with `Author` and
#      `Content` columns. Inside this function `t(x)` still calls the
#      transpose function, because R skips non-function bindings when
#      resolving a call.
# Returns: whatever dev.off() returns once the PNG device is closed.
v5 <- function(t) {
  per_author <- t[[13]] %>%
    group_by(Author) %>%
    summarise(nr_of_posts = n(), text = paste0(Content, collapse = " ")) %>%
    arrange(desc(nr_of_posts))

  init <- textTinyR::sparse_term_matrix$new(
    vector_data = per_author$text,
    file_data = NULL,
    document_term_matrix = TRUE
  )
  # The matrix returned here is not used directly; the sparsity-adjusted
  # matrix is requested from the same object immediately afterwards.
  init$Term_Matrix(
    sort_terms = TRUE, to_lower = TRUE, remove_punctuation_vector = FALSE,
    remove_numbers = TRUE, trim_token = TRUE, split_string = TRUE,
    stemmer = "porter2_stemmer",
    split_separator = " \r\n\t.,;:()?!//", remove_stopwords = TRUE,
    language = "english", min_num_char = 3, max_num_char = 100,
    print_every_rows = 100000, normalize = NULL, tf_idf = TRUE,
    threads = 6, verbose = TRUE
  )

  doc_terms <- unique(as.matrix(init$Term_Matrix_Adjust(sparsity_thresh = 0.9)))
  # Rows without any remaining term are dropped before the cosine step;
  # drop = FALSE keeps a matrix even if only one row survives.
  doc_terms <- doc_terms[rowSums(doc_terms) > 0, , drop = FALSE]

  # cmdscale() expects dissimilarities, so similarities are converted with
  # 1 - similarity, matching the script-level code in this file.
  # Assumes cosine() returns a column-wise cosine *similarity* matrix
  # (e.g. lsa::cosine) -- TODO confirm which package provides it.
  dist_scaled <- 1 - cosine(t(scale(doc_terms)))
  dist_raw <- 1 - cosine(t(doc_terms))

  pc_scaled <- cmdscale(dist_scaled, k = 50)
  pc_raw <- cmdscale(dist_raw, k = 50)

  tsne_scaled <- Rtsne(pc_scaled, dims = 2, perplexity = 30, verbose = TRUE,
                       max_iter = 2000, is_distance = FALSE, pca = FALSE)
  tsne_raw <- Rtsne(pc_raw, dims = 2, perplexity = 30, verbose = TRUE,
                    max_iter = 2000, is_distance = FALSE, pca = FALSE)

  png("tsne_v5.png", width = 3200, height = 1600, res = 300)
  # Close the PNG device even when plotting fails, so it does not stay open.
  device_open <- TRUE
  on.exit(if (device_open) dev.off(), add = TRUE)

  par(mfrow = c(1, 2))
  plot(tsne_raw$Y[, 1], tsne_raw$Y[, 2],
       main = "Tf-Idf + Cosine + PCA + tSNE",
       xlab = "Dim1", ylab = "Dim2",
       col = adjustcolor(1, alpha.f = 0.5), pch = 16)
  plot(tsne_scaled$Y[, 1], tsne_scaled$Y[, 2],
       main = "Tf+Idf + Scale + Cosine + PCA + tSNE",
       xlab = "Dim1", ylab = "Dim2",
       col = adjustcolor(1, alpha.f = 0.5), pch = 16)

  device_open <- FALSE
  dev.off()
}
# Script-level run: Tf + cosine distance + cmdscale + t-SNE on t[[13]].
# Everything below is assigned in the calling environment; `tsne` and
# `m_adj_uniq` are passed on to plotTSNE() at the end.
# NOTE(review): `t` is used both as data (`t[[13]]`) and as the transpose
# function (`t(m_adj_uniq)`); R resolves the call form to the function.

# One row per author: number of posts and all posts pasted into one string,
# sorted by descending post count.
T13 = t[[13]] %>%
group_by(Author) %>%
summarise( nr_of_posts = n(), text = paste0(Content, collapse = " ")) %>%
arrange(desc(nr_of_posts))
# Document-term matrix builder over the per-author texts.
init = textTinyR::sparse_term_matrix$new(vector_data = T13$text, file_data = NULL, document_term_matrix = TRUE)
# Plain term frequencies (tf_idf = F), Porter2 stemming, English stop words
# removed, tokens limited to 3..100 characters. `tm` is not read again in the
# visible code; the adjusted matrix is taken from `init` below.
tm = init$Term_Matrix(sort_terms = FALSE, to_lower = T, remove_punctuation_vector = F,
remove_numbers = T, trim_token = T, split_string = T,
stemmer = "porter2_stemmer",
split_separator = " \r\n\t.,;:()?!//", remove_stopwords = T,
language = "english", min_num_char = 3, max_num_char = 100,
print_every_rows = 100000, normalize = NULL, tf_idf = F,
threads = 3, verbose = T)
# Dense matrix after removing terms above the sparsity threshold.
m_adj <- as.matrix(init$Term_Matrix_Adjust(sparsity_thresh = 0.98498730956292))
# The dim() calls print the matrix size after each filtering step.
dim(m_adj)
# Collapse duplicated rows.
m_adj_uniq = unique(m_adj)
dim(m_adj_uniq)
# Keep only rows with a positive row sum.
m_adj_uniq = m_adj_uniq[rowSums(m_adj_uniq) > 0,]
dim(m_adj_uniq)
# Turn the cosine() result into a distance (1 - value); rows are transposed
# into columns first.
# NOTE(review): cosine() is not defined in the visible code -- presumably a
# column-wise cosine similarity from an attached package; confirm.
cosine.dist = 1 - cosine(t(m_adj_uniq))
# Classical MDS (cmdscale) down to 50 coordinates; the variable name and the
# plot title refer to this step as PCA.
after.pca = cmdscale(cosine.dist, 50)
# t-SNE on the 50 coordinates, treated as data (is_distance = F) and with
# Rtsne's own PCA step disabled (pca = F).
tsne = Rtsne(after.pca, dims = 2, perplexity=50, verbose=TRUE, max_iter = 2000, is_distance = F, pca = F)
# Save the embedding as a semi-transparent scatter plot.
png("tsne_tf_dist_PCA_1463words.png", width = 1600, height = 1600, res = 300)
plot(tsne$Y[,1], tsne$Y[,2],main="Tf + Cosine + PCA + tSNE",xlab="Dim1", ylab = "Dim2", col = adjustcolor(1, alpha=0.5), pch=16)
dev.off()
# NOTE(review): plotTSNE() and performTSNE() are defined outside the visible
# code; their behaviour and argument meaning cannot be confirmed from here.
# plotTSNE() receives the embedding and the first 39 term columns.
plotTSNE(tsne, m_adj_uniq[,1:39])
performTSNE(t, 13, 0.9825932, T)
#### Before ####
# Earlier version of the script-level experiment, using tf-idf weights.
# NOTE(review): as written this section cannot run from a clean session:
# `pp` and `cosine_similarity_uniq` are used below but never assigned in the
# visible code (the only assignment to `cosine_similarity_uniq` is commented
# out). Confirm whether they are expected to exist from an earlier session.

# One row per author: number of posts and all posts pasted into one string,
# sorted by descending post count.
T_ = t[[13]] %>%
group_by(Author) %>%
summarise( nr_of_posts = n(), text = paste0(Content, collapse = " ")) %>%
arrange(desc(nr_of_posts))
# Document-term matrix builder over the per-author texts.
init = textTinyR::sparse_term_matrix$new(vector_data = T_$text, file_data = NULL, document_term_matrix = TRUE)
# Tf-idf weighting (tf_idf = T), Porter2 stemming, English stop words removed.
# `tm` is not read again in the visible code.
tm = init$Term_Matrix(sort_terms = FALSE, to_lower = T, remove_punctuation_vector = F,
remove_numbers = T, trim_token = T, split_string = T,
stemmer = "porter2_stemmer",
split_separator = " \r\n\t.,;:()?!//", remove_stopwords = T,
language = "english", min_num_char = 3, max_num_char = 100,
print_every_rows = 100000, normalize = NULL, tf_idf = T,
threads = 6, verbose = T)
# Dense matrix after removing terms above the sparsity threshold.
m_adj <- as.matrix(init$Term_Matrix_Adjust(sparsity_thresh = 0.98498730956292))
# The dim() calls print the matrix size before and after de-duplication.
dim(m_adj)
# Collapse duplicated rows (overwrites m_adj in place).
m_adj = unique(m_adj)
dim(m_adj)
#euclidean_sim <- dist(scale(m_adj), method = "euclidean")
# Keep only rows with a positive row sum.
m_adj = m_adj[rowSums(m_adj) > 0,]
#pc = cmdscale(dist(scale(m_adj), method = "euclidean"), k = 50)
# NOTE(review): the cosine() result is passed to cmdscale() directly, without
# the `1 - cosine(...)` conversion used elsewhere in this file, and
# `pc_cosine` is not read again in the visible code.
pc_cosine = cmdscale(cosine(t(scale(m_adj))), k = 50)
#cosine_similarity_uniq = unique(t(unique(cosine_sim)))
# NOTE(review): `pp` is undefined here; `pp$x` looks like the scores of a
# prcomp() result -- confirm. Also, the plot title below says
# "perplexity = 5" while this call uses perplexity=30.
tsne = Rtsne(pp$x, dims = 2, perplexity=30, verbose=TRUE, max_iter = 2000, is_distance = F, pca = F)
png("tsne.png", width = 1800, height = 1800, res = 300)
plot(tsne$Y[,1], tsne$Y[,2],main="2D Representation - T1 (15-cut); perplexity = 5",xlab="Dim1", ylab = "Dim2", col = adjustcolor(1, alpha=0.5), pch=16)
dev.off()
# Same plot at perplexity 30 with 3000 iterations. `is_distance` and `pca`
# are not passed, so Rtsne's defaults apply to `cosine_similarity_uniq`.
tsne = Rtsne(cosine_similarity_uniq, dims = 2, perplexity=30, verbose=TRUE, max_iter = 3000)
png("tsne_30.png", width = 1800, height = 1800, res = 300)
plot(tsne$Y[,1], tsne$Y[,2],main="2D Representation - T1 (15-cut); perplexity = 30",xlab="Dim1", ylab = "Dim2", col = adjustcolor(1, alpha=0.5), pch=16)
dev.off()
# Same plot at perplexity 50.
tsne = Rtsne(cosine_similarity_uniq, dims = 2, perplexity=50, verbose=TRUE, max_iter = 3000)
png("tsne_50.png", width = 1800, height = 1800, res = 300)
plot(tsne$Y[,1], tsne$Y[,2],main="2D Representation - T1 (15-cut); perplexity = 50",xlab="Dim1", ylab = "Dim2", col = adjustcolor(1, alpha=0.5), pch=16)
dev.off()
####