
Commit 166dccb

Theodoros Vasileiadis committed
Make Scatter/Gather simulation interactive
1 parent 09c6c5e commit 166dccb

File tree: 3 files changed (+389 -120 lines changed)


toolset/clustering.py (+127 -92)
@@ -1,19 +1,20 @@
 # -*- coding: utf-8 -*-
 """ Provides methods for applying clustering on a text document collection.
 """
+import pickle
 import re
 import time
-import pickle
+
+import nltk
+import pandas as pd
+from nltk.stem.snowball import SnowballStemmer
+from sklearn.cluster import AgglomerativeClustering, MiniBatchKMeans
+from sklearn.decomposition import TruncatedSVD
 from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
 from sklearn.pipeline import make_pipeline
-from sklearn.externals import joblib
-from sklearn.decomposition import TruncatedSVD
 from sklearn.preprocessing import Normalizer
-from sklearn.cluster import MiniBatchKMeans, AgglomerativeClustering
-from sklearn.metrics.pairwise import cosine_similarity
-import nltk
-from nltk.stem.snowball import SnowballStemmer
-import pandas as pd
+
 
 def tokenize(text):
     """ Takes a String as input and returns a list of its tokens.
@@ -27,14 +28,18 @@ def tokenize(text):
 
     """
     filtered_tokens = []
-    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in
-              nltk.word_tokenize(sent)]
+    tokens = [
+        word.lower()
+        for sent in nltk.sent_tokenize(text)
+        for word in nltk.word_tokenize(sent)
+    ]
     # Remove tokens that do not contain letters.
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens
 
+
 def stem(tokens):
     """ Takes a list of tokens as input and stems each entry.
 
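
Note on the hunk above: the reformatted comprehension is behavior-preserving: sentences are split with NLTK, words are lower-cased, and only tokens containing at least one letter survive before stemming. A minimal sketch of that flow (it assumes the NLTK 'punkt' data is downloaded; the sample text and the use of SnowballStemmer for the stem step are illustrative assumptions, not taken from the diff):

import re

import nltk
from nltk.stem.snowball import SnowballStemmer

text = "Scatter/Gather lets a reader re-cluster 500 documents on the fly."
tokens = [
    word.lower()
    for sent in nltk.sent_tokenize(text)
    for word in nltk.word_tokenize(sent)
]
# Keep only tokens that contain at least one letter, as tokenize() does.
filtered_tokens = [t for t in tokens if re.search('[a-zA-Z]', t)]
# Stem the surviving tokens with the Snowball stemmer imported by this module.
stemmer = SnowballStemmer('english')
stems = [stemmer.stem(t) for t in filtered_tokens]
print(filtered_tokens)
print(stems)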
@@ -52,6 +57,7 @@ def stem(tokens):
 
     return stems
 
+
 def tokenizer(text):
     """ Tokenizes and then stems a given text.
 
@@ -75,16 +81,17 @@ class ClusterMaker(object):
 
     Args:
         n_clusters (int): The number of clusters to be created.
-        n_dimensions (int): When given a value, specifies the number of dimensions
-            of the vector space after applying Latent Semantic Analysis. Defaults
-            to None.
+        n_dimensions (int): When given a value, specifies the number of
+            dimensions of the vector space after applying Latent Semantic
+            Analysis. Defaults to None.
 
     Attributes:
         n_clusters (int): The number of clusters to be created.
-        n_dimensions (int): When given a value, specifies the number of dimensions
-            of the vector space after applying Latent Semantic Analysis. Defaults
-            to None.
+        n_dimensions (int): When given a value, specifies the number of
+            dimensions of the vector space after applying Latent Semantic
+            Analysis. Defaults to None.
     """
+
     def __init__(self, corpus):
         self.corpus = corpus
 
@@ -95,74 +102,89 @@ def extract_tfidf(self):
         the matrix and the features of the collection are saved in files.
 
         Args:
-            self.corpus (:obj:'Corpus'): The Corpus object of the document collection.
+            self.corpus (:obj:'Corpus'): The Corpus object of the document
+                collection.
 
         Returns:
-            tfidf_matrix (sparse matrix): The Tf/idf matrix of the document collection.
+            tfidf (sparse matrix): The Tf/idf matrix of the document
+                collection.
 
         """
         # Initialize the vectorizer.
-        vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, max_features=10000,
-                                     use_idf=True, stop_words='english',
-                                     tokenizer=tokenizer, ngram_range=(1, 3))
+        vectorizer = TfidfVectorizer(
+            max_df=0.5,
+            min_df=2,
+            max_features=10000,
+            use_idf=True,
+            stop_words='english',
+            tokenizer=tokenizer,
+            ngram_range=(1, 3))
         print("DEBUG Created vectorizer")
         # Compute the Tf/Idf matrix of the corpus.
-        tfidf_matrix = vectorizer.fit_transform(self.corpus.document_generator())
+        tfidf = vectorizer.fit_transform(
+            self.corpus.document_generator())
         # Get feature names from the fitted vectorizer.
         features = vectorizer.get_feature_names()
-        print(tfidf_matrix.shape)
+        print(tfidf.shape)
         print("DEBUG Computed tfidf")
-
-        pickle.dump(tfidf_matrix, open('tfidf.pkl', 'wb'))
-        pickle.dump(features, open('features.pkl', 'wb'))
-        return tfidf_matrix
-
-    def kmeans(self, n_clusters, tfidf_path=None, n_dimensions=None, verbose=False):
+        pickle.dump(tfidf, open('tfidf.txt', 'wb'))
+        pickle.dump(features, open('features.txt', 'wb'))
+        return tfidf
+
+    def kmeans(self,
+               n_clusters,
+               tfidf=None,
+               n_dimensions=None,
+               verbose=False):
         """ Applies kmeans clustering on a document collection.
 
         Args:
-            self.corpus (:obj:'Corpus'): The Corpus object of the document collection.
-                Defaults to None. Only used when no pre-computed Tf/Idf matrix is
-                given.
-            tfidf_path (str): The path to the file containing the Tf/Idf matrix .pkl file.
-                Defaults to None and in this case the Tf/Idf matrix is calculated.
+            self.corpus (:obj:'Corpus'): The Corpus object of the document
+                collection. Defaults to None. Only used when no pre-computed
+                Tf/Idf matrix is given.
+            tfidf_path (str): The path to the file containing the Tf/Idf matrix
+                .pkl file. Defaults to None and in this case the Tf/Idf matrix
+                is calculated.
             verbose (bool): When True additional information will be printed.
                 Defaults to False.
 
         Returns:
-            kmodel (:obj:'Kmeans'): Scikit KMeans clustering model.
+            kmodel (:obj:'Kmeans'): Scikit KMeans clustering model.
 
         """
         print("DEBUG Making cluster model")
 
         # Compute or load Tf/Idf matrix.
-        if tfidf_path is None:
-            tfidf_matrix = self.extract_tfidf(self.corpus)
-            print(tfidf_matrix.shape)
-        else:
-            tfidf_matrix = pickle.load(open(tfidf_path, 'rb'))
-            print(tfidf_matrix.shape)
-            print('Loaded Tf/Idf matrix.')
+        if tfidf is None:
+            tfidf = self.extract_tfidf(self.corpus)
+            print(tfidf.shape)
+
+        print('Loaded Tf/Idf matrix.')
 
         # Apply latent semantic analysis.
-        if n_dimensions != None:
+        if n_dimensions is not None:
             print('Performing latent semantic analysis')
             svd = TruncatedSVD(n_dimensions)
             # Normalize SVD results for better clustering results.
             lsa = make_pipeline(svd, Normalizer(copy=False))
-            tfidf_matrix = lsa.fit_transform(tfidf_matrix)
-            print(tfidf_matrix.shape)
+            tfidf = lsa.fit_transform(tfidf)
+            print(tfidf.shape)
             print('DEBUG LSA completed')
 
         # Do the clustering.
         start_time = time.time()
-        kmodel = MiniBatchKMeans(n_clusters=n_clusters, init='k-means++', n_init=1, max_iter=10,
-                                 verbose=True)
+        kmodel = MiniBatchKMeans(
+            n_clusters=n_clusters,
+            init='k-means++',
+            n_init=1,
+            max_iter=10,
+            verbose=True)
         print('Clustering with %s' % kmodel)
-        kmodel.fit(tfidf_matrix)
+        kmodel.fit(tfidf)
         end_time = time.time()
 
-        # Create a matching of the clusters and the ids of the documents they contain.
+        # Create a matching of the clusters and the ids of the documents
+        # they contain.
         cluster_doc = pd.Series()
         for i in range(kmodel.n_clusters):
             ids = []
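
Note on the hunk above: kmeans() no longer takes a tfidf_path to a pickled matrix; it accepts an in-memory matrix through the new tfidf keyword and only recomputes it from self.corpus when tfidf is None, while n_dimensions switches the LSA reduction on. A hypothetical usage sketch (ToyCorpus and the sample documents are assumptions standing in for the project's real Corpus class; ClusterMaker and kmeans are the ones defined in this file):

from sklearn.feature_extraction.text import TfidfVectorizer

from toolset.clustering import ClusterMaker


class ToyCorpus(object):
    """ Stand-in for the real Corpus class, assumed only for this sketch. """

    def __init__(self, documents):
        self.documents = documents

    def document_generator(self):
        return iter(self.documents)


docs = ["gather scatter browsing interface",
        "scatter gather clustering of documents",
        "tf idf weighting of document terms",
        "idf weighting and term clustering"]

# Pass a pre-computed matrix through the new tfidf= keyword instead of a path;
# n_dimensions is left at None, so the LSA step is skipped.
tfidf = TfidfVectorizer().fit_transform(docs)
maker = ClusterMaker(ToyCorpus(docs))
kmodel = maker.kmeans(n_clusters=2, tfidf=tfidf, verbose=False)
print(kmodel.labels_)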
@@ -171,83 +193,93 @@ def kmeans(self, n_clusters, tfidf_path=None, n_dimensions=None, verbose=False):
                 ids.append(docid)
             cluster_doc.loc[i] = ids
 
-
-        pickle.dump(kmodel, open('kmodel.pkl', 'wb'))
-        pickle.dump(cluster_doc, open('cluster_doc.pkl', 'wb'))
-
         if verbose:
             # Print some info.
             print("Top terms per cluster:")
-            if n_dimensions != None:
-                original_space_centroids = svd.inverse_transform(kmodel.cluster_centers_)
+            if n_dimensions is not None:
+                original_space_centroids = svd.inverse_transform(
+                    kmodel.cluster_centers_)
                 order_centroids = original_space_centroids.argsort()[:, ::-1]
             else:
                 order_centroids = kmodel.cluster_centers_.argsort()[:, ::-1]
 
             features = pickle.load(open('features.pkl', 'rb'))
+            cluster_word = pd.Series()
             for i in range(n_clusters):
+                cluster_features = []
                 print("Cluster %d:" % i)
-                for ind in order_centroids[i, :10]:
-                    print(' %s' % features[ind])
-                print()
-        print('Clustering completed after ' + str(round((end_time-start_time)/60)) + "' "
-              + str(round((end_time-start_time)%60)) + "''")
+                for ind in order_centroids[i, :100]:
+                    cluster_features.append(features[ind])
+                cluster_word.loc[i] = cluster_features
+
+            pickle.dump(kmodel, open('kmodel.pkl', 'wb'))
+            pickle.dump(kmodel.cluster_centers_, open('centers.pkl', 'wb'))
+            pickle.dump(cluster_doc, open('cluster_doc.pkl', 'wb'))
+            pickle.dump(cluster_word, open('cluster_word.pkl', 'wb'))
+
+        print('Clustering completed after ' +
+              str(round((end_time - start_time) / 60)) + "' " +
+              str(round((end_time - start_time) % 60)) + "''")
 
         return kmodel
 
-    def hac(self, tfidf_path=None, verbose=False):
+    def hac(self,
+            n_clusters,
+            verbose=False,
+            tfidf=None,
+            n_dimensions=None):
         """ Apply Hierarchical Agglomerative Clustering on a document collection.
 
-        This method generates a hierarchical clustering tree for the collection. The leaves
-        of the tree are clusters consisting of single documents. The tree is then saved by
-        saving the list of merges in a file.
+        This method generates a hierarchical clustering tree for the collection.
+        The leaves of the tree are clusters consisting of single documents.
+        The tree is then saved by saving the list of merges in a file.
 
-        Each entry of this list contains the two tree nodes that were merged to create a
-        new node and the new node's id. Node ids less than the number of leaves represent
-        leaves, while node ids greater than the number of leaves indicate internal nodes.
+        Each entry of this list contains the two tree nodes that were merged to
+        create a new node and the new node's id. Node ids less than the number
+        of leaves represent leaves, while node ids greater than the number of
+        leaves indicate internal nodes.
 
         Args:
-            self.corpus (:obj:'Corpus'): The Corpus object of the document collection.
-                Defaults to None. Only used when no pre-computed Tf/Idf matrix is
-                given.
-            tfidf_path (str): The path to the file containing the Tf/Idf matrix .pkl file.
-                Defaults to None and in this case the Tf/Idf matrix is calculated.
+            self.corpus (:obj:'Corpus'): The Corpus object of the document
+                collection. Defaults to None. Only used when no pre-computed
+                Tf/Idf matrix is given.
+            tfidf_path (str): The path to the file containing the Tf/Idf matrix
+                .pkl file. Defaults to None and in this case the Tf/Idf matrix
+                is calculated.
             verbose (bool): When True additional information will be printed.
                 Defaults to False.
 
         Returns:
-            hac_model (:obj:'AgglomerativeClustering'): The HAC model fitted on the
-                document collection.
+            hac_model (:obj:'AgglomerativeClustering'): The HAC model fitted on
+                the document collection.
 
         """
         # Compute or load Tf/Idf matrix.
-        if tfidf_path is None:
-            tfidf_matrix = self.extract_tfidf(self.corpus)
-            print(tfidf_matrix.shape)
-        else:
-            tfidf_matrix = pickle.load(open(tfidf_path, 'rb'))
-            print(tfidf_matrix.shape)
-            print('Loaded Tf/Idf matrix.')
+        if tfidf is None:
+            tfidf = self.extract_tfidf(self.corpus)
+            print(tfidf.shape)
+
+        print('Loaded Tf/Idf matrix.')
 
         # Apply latent semantic analysis.
-        if n_dimensions != None:
+        if n_dimensions is not None:
             print('Performing latent semantic analysis')
             svd = TruncatedSVD(n_dimensions)
             # Normalize SVD results for better clustering results.
             lsa = make_pipeline(svd, Normalizer(copy=False))
-            tfidf_matrix = lsa.fit_transform(tfidf_matrix)
+            tfidf = lsa.fit_transform(tfidf)
 
-            print(tfidf_matrix.shape)
+            print(tfidf.shape)
             print('DEBUG LSA completed')
 
-
         # Calculate documente distance matrix from Tf/Idf matrix
-        dist = 1 - cosine_similarity(tfidf_matrix)
+        dist = 1 - cosine_similarity(tfidf)
         print('DEBUG Computed distance matrix.')
 
         start_time = time.time()
         # Generate HAC model.
-        hac_model = AgglomerativeClustering(linkage='ward', n_clusters=n_clusters)
+        hac_model = AgglomerativeClustering(
+            linkage='ward', n_clusters=n_clusters)
         # Fit the model on the distance matrix.
         hac_model.fit(dist)
         end_time = time.time()
@@ -257,15 +289,18 @@ def hac(self, tfidf_path=None, verbose=False):
         if verbose:
             # Visualize cluster model
             children = hac_model.children_
-            merges = [{'node_id': node_id+len(dist),
-                       'right': children[node_id, 0], 'left': children[node_id, 1]
-                       } for node_id in range(0, len(children))]
+            merges = [{
+                'node_id': node_id + len(dist),
+                'right': children[node_id, 0],
+                'left': children[node_id, 1]
+            } for node_id in range(0, len(children))]
             pickle.dump(merges, open('merges.pkl', 'wb'))
             pickle.dump(children, open('children.pkl', 'wb'))
 
             for merge_entry in enumerate(merges):
                 print(merge_entry[1])
 
-        print('Clustering completed after ' + str(round((end_time-start_time)/60)) + "' "
-              + str(round((end_time-start_time)%60)) + "''")
+        print('Clustering completed after ' +
+              str(round((end_time - start_time) / 60)) + "' " +
+              str(round((end_time - start_time) % 60)) + "''")
         return hac_model
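
Note on the hunk above: per the hac() docstring, the saved merge list encodes the whole clustering tree: each entry's node_id is a new internal node (offset by the number of documents), and any id below the document count refers to a leaf. A minimal sketch of walking merges.pkl back into readable merge steps (the file only exists after hac() has run with verbose=True):

import pickle

with open('merges.pkl', 'rb') as f:
    merges = pickle.load(f)

# AgglomerativeClustering performs exactly n_documents - 1 merges.
n_leaves = len(merges) + 1


def label(node_id):
    # Ids below the leaf count are documents; larger ids are internal nodes.
    return 'doc %d' % node_id if node_id < n_leaves else 'node %d' % node_id


for entry in merges:
    print('%s = %s + %s' % (label(entry['node_id']),
                            label(entry['left']),
                            label(entry['right'])))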
