# -*- coding: utf-8 -*-
""" Provides methods for applying clustering on a text document collection.
"""
+ import pickle
import re
import time
- import pickle
+
+ import nltk
+ import pandas as pd
+ from nltk.stem.snowball import SnowballStemmer
+ from sklearn.cluster import AgglomerativeClustering, MiniBatchKMeans
+ from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import make_pipeline
- from sklearn.externals import joblib
- from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
- from sklearn.cluster import MiniBatchKMeans, AgglomerativeClustering
- from sklearn.metrics.pairwise import cosine_similarity
- import nltk
- from nltk.stem.snowball import SnowballStemmer
- import pandas as pd
+

def tokenize(text):
    """ Takes a String as input and returns a list of its tokens.
@@ -27,14 +28,18 @@ def tokenize(text):

    """
    filtered_tokens = []
-     tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in
-               nltk.word_tokenize(sent)]
+     tokens = [
+         word.lower()
+         for sent in nltk.sent_tokenize(text)
+         for word in nltk.word_tokenize(sent)
+     ]
    # Remove tokens that do not contain letters.
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

+
def stem(tokens):
    """ Takes a list of tokens as input and stems each entry.

@@ -52,6 +57,7 @@ def stem(tokens):
    return stems

+
def tokenizer(text):
    """ Tokenizes and then stems a given text.

@@ -75,16 +81,17 @@ class ClusterMaker(object):
    Args:
        n_clusters (int): The number of clusters to be created.
-         n_dimensions (int): When given a value, specifies the number of dimensions
-             of the vector space after applying Latent Semantic Analysis. Defaults
-             to None.
+         n_dimensions (int): When given a value, specifies the number of
+             dimensions of the vector space after applying Latent Semantic
+             Analysis. Defaults to None.

    Attributes:
        n_clusters (int): The number of clusters to be created.
-         n_dimensions (int): When given a value, specifies the number of dimensions
-             of the vector space after applying Latent Semantic Analysis. Defaults
-             to None.
+         n_dimensions (int): When given a value, specifies the number of
+             dimensions of the vector space after applying Latent Semantic
+             Analysis. Defaults to None.
    """
+
    def __init__(self, corpus):
        self.corpus = corpus

@@ -95,74 +102,89 @@ def extract_tfidf(self):
        the matrix and the features of the collection are saved in files.

        Args:
-             self.corpus (:obj:'Corpus'): The Corpus object of the document collection.
+             self.corpus (:obj:'Corpus'): The Corpus object of the document
+                 collection.

        Returns:
-             tfidf_matrix (sparse matrix): The Tf/idf matrix of the document collection.
+             tfidf (sparse matrix): The Tf/idf matrix of the document
+                 collection.

        """
        # Initialize the vectorizer.
-         vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, max_features=10000,
-                                      use_idf=True, stop_words='english',
-                                      tokenizer=tokenizer, ngram_range=(1, 3))
+         vectorizer = TfidfVectorizer(
+             max_df=0.5,
+             min_df=2,
+             max_features=10000,
+             use_idf=True,
+             stop_words='english',
+             tokenizer=tokenizer,
+             ngram_range=(1, 3))
        print("DEBUG Created vectorizer")
        # Compute the Tf/Idf matrix of the corpus.
-         tfidf_matrix = vectorizer.fit_transform(self.corpus.document_generator())
+         tfidf = vectorizer.fit_transform(
+             self.corpus.document_generator())
        # Get feature names from the fitted vectorizer.
        features = vectorizer.get_feature_names()
-         print(tfidf_matrix.shape)
+         print(tfidf.shape)
        print("DEBUG Computed tfidf")
-
-         pickle.dump(tfidf_matrix, open('tfidf.pkl', 'wb'))
-         pickle.dump(features, open('features.pkl', 'wb'))
-         return tfidf_matrix
-
-     def kmeans(self, n_clusters, tfidf_path=None, n_dimensions=None, verbose=False):
+         pickle.dump(tfidf, open('tfidf.txt', 'wb'))
+         pickle.dump(features, open('features.txt', 'wb'))
+         return tfidf
+
+     def kmeans(self,
+                n_clusters,
+                tfidf=None,
+                n_dimensions=None,
+                verbose=False):
        """ Applies kmeans clustering on a document collection.

        Args:
-             self.corpus (:obj:'Corpus'): The Corpus object of the document collection.
-                 Defaults to None. Only used when no pre-computed Tf/Idf matrix is
-                 given.
-             tfidf_path (str): The path to the file containing the Tf/Idf matrix .pkl file.
-                 Defaults to None and in this case the Tf/Idf matrix is calculated.
+             self.corpus (:obj:'Corpus'): The Corpus object of the document
+                 collection. Defaults to None. Only used when no pre-computed
+                 Tf/Idf matrix is given.
+             tfidf (sparse matrix): A pre-computed Tf/Idf matrix of the
+                 document collection. Defaults to None, in which case the
+                 Tf/Idf matrix is computed from the corpus.
            verbose (bool): When True additional information will be printed.
                Defaults to False.

        Returns:
-             kmodel (:obj:'Kmeans'): Scikit KMeans clustering model.
+             kmodel (:obj:'Kmeans'): Scikit KMeans clustering model.

        """
        print("DEBUG Making cluster model")

        # Compute or load Tf/Idf matrix.
-         if tfidf_path is None:
-             tfidf_matrix = self.extract_tfidf(self.corpus)
-             print(tfidf_matrix.shape)
-         else:
-             tfidf_matrix = pickle.load(open(tfidf_path, 'rb'))
-             print(tfidf_matrix.shape)
-             print('Loaded Tf/Idf matrix.')
+         if tfidf is None:
+             tfidf = self.extract_tfidf(self.corpus)
+             print(tfidf.shape)
+
+         print('Loaded Tf/Idf matrix.')

        # Apply latent semantic analysis.
-         if n_dimensions != None:
+         if n_dimensions is not None:
            print('Performing latent semantic analysis')
            svd = TruncatedSVD(n_dimensions)
            # Normalize SVD results for better clustering results.
            lsa = make_pipeline(svd, Normalizer(copy=False))
-             tfidf_matrix = lsa.fit_transform(tfidf_matrix)
-             print(tfidf_matrix.shape)
+             tfidf = lsa.fit_transform(tfidf)
+             print(tfidf.shape)
            print('DEBUG LSA completed')

        # Do the clustering.
        start_time = time.time()
-         kmodel = MiniBatchKMeans(n_clusters=n_clusters, init='k-means++', n_init=1, max_iter=10,
-                                  verbose=True)
+         kmodel = MiniBatchKMeans(
+             n_clusters=n_clusters,
+             init='k-means++',
+             n_init=1,
+             max_iter=10,
+             verbose=True)
        print('Clustering with %s' % kmodel)
-         kmodel.fit(tfidf_matrix)
+         kmodel.fit(tfidf)
        end_time = time.time()

-         # Create a matching of the clusters and the ids of the documents they contain.
+         # Create a matching of the clusters and the ids of the documents
+         # they contain.
        cluster_doc = pd.Series()
        for i in range(kmodel.n_clusters):
            ids = []
@@ -171,83 +193,93 @@ def kmeans(self, n_clusters, tfidf_path=None, n_dimensions=None, verbose=False):
                ids.append(docid)
            cluster_doc.loc[i] = ids

-
-         pickle.dump(kmodel, open('kmodel.pkl', 'wb'))
-         pickle.dump(cluster_doc, open('cluster_doc.pkl', 'wb'))
-
        if verbose:
            # Print some info.
            print("Top terms per cluster:")
-             if n_dimensions != None:
-                 original_space_centroids = svd.inverse_transform(kmodel.cluster_centers_)
+             if n_dimensions is not None:
+                 original_space_centroids = svd.inverse_transform(
+                     kmodel.cluster_centers_)
                order_centroids = original_space_centroids.argsort()[:, ::-1]
            else:
                order_centroids = kmodel.cluster_centers_.argsort()[:, ::-1]

            features = pickle.load(open('features.pkl', 'rb'))
+             cluster_word = pd.Series()
            for i in range(n_clusters):
+                 cluster_features = []
                print("Cluster %d:" % i)
-                 for ind in order_centroids[i, :10]:
-                     print(' %s' % features[ind])
-                 print()
-         print('Clustering completed after ' + str(round((end_time - start_time)/60)) + "' "
-               + str(round((end_time - start_time)%60)) + "''")
+                 for ind in order_centroids[i, :100]:
+                     cluster_features.append(features[ind])
+                 cluster_word.loc[i] = cluster_features
+
+             pickle.dump(kmodel, open('kmodel.pkl', 'wb'))
+             pickle.dump(kmodel.cluster_centers_, open('centers.pkl', 'wb'))
+             pickle.dump(cluster_doc, open('cluster_doc.pkl', 'wb'))
+             pickle.dump(cluster_word, open('cluster_word.pkl', 'wb'))
+
+         print('Clustering completed after ' +
+               str(round((end_time - start_time) / 60)) + "' " +
+               str(round((end_time - start_time) % 60)) + "''")

        return kmodel

-     def hac(self, tfidf_path=None, verbose=False):
+     def hac(self,
+             n_clusters,
+             verbose=False,
+             tfidf=None,
+             n_dimensions=None):
        """ Apply Hierarchical Agglomerative Clustering on a document collection.

-         This method generates a hierarchical clustering tree for the collection. The leaves
-         of the tree are clusters consisting of single documents. The tree is then saved by
-         saving the list of merges in a file.
+         This method generates a hierarchical clustering tree for the collection.
+         The leaves of the tree are clusters consisting of single documents.
+         The tree is then saved by saving the list of merges in a file.

-         Each entry of this list contains the two tree nodes that were merged to create a
-         new node and the new node's id. Node ids less than the number of leaves represent
-         leaves, while node ids greater than the number of leaves indicate internal nodes.
+         Each entry of this list contains the two tree nodes that were merged to
+         create a new node and the new node's id. Node ids less than the number
+         of leaves represent leaves, while node ids greater than the number of
+         leaves indicate internal nodes.

        Args:
-             self.corpus (:obj:'Corpus'): The Corpus object of the document collection.
-                 Defaults to None. Only used when no pre-computed Tf/Idf matrix is
-                 given.
-             tfidf_path (str): The path to the file containing the Tf/Idf matrix .pkl file.
-                 Defaults to None and in this case the Tf/Idf matrix is calculated.
+             self.corpus (:obj:'Corpus'): The Corpus object of the document
+                 collection. Defaults to None. Only used when no pre-computed
+                 Tf/Idf matrix is given.
+             tfidf (sparse matrix): A pre-computed Tf/Idf matrix of the
+                 document collection. Defaults to None, in which case the
+                 Tf/Idf matrix is computed from the corpus.
            verbose (bool): When True additional information will be printed.
                Defaults to False.

        Returns:
-             hac_model (:obj:'AgglomerativeClustering'): The HAC model fitted on the
-                 document collection.
+             hac_model (:obj:'AgglomerativeClustering'): The HAC model fitted on
+                 the document collection.

        """
        # Compute or load Tf/Idf matrix.
-         if tfidf_path is None:
-             tfidf_matrix = self.extract_tfidf(self.corpus)
-             print(tfidf_matrix.shape)
-         else:
-             tfidf_matrix = pickle.load(open(tfidf_path, 'rb'))
-             print(tfidf_matrix.shape)
-             print('Loaded Tf/Idf matrix.')
+         if tfidf is None:
+             tfidf = self.extract_tfidf(self.corpus)
+             print(tfidf.shape)
+
+         print('Loaded Tf/Idf matrix.')

        # Apply latent semantic analysis.
-         if n_dimensions != None:
+         if n_dimensions is not None:
            print('Performing latent semantic analysis')
            svd = TruncatedSVD(n_dimensions)
            # Normalize SVD results for better clustering results.
            lsa = make_pipeline(svd, Normalizer(copy=False))
-             tfidf_matrix = lsa.fit_transform(tfidf_matrix)
+             tfidf = lsa.fit_transform(tfidf)

-             print(tfidf_matrix.shape)
+             print(tfidf.shape)
            print('DEBUG LSA completed')

-
        # Calculate document distance matrix from Tf/Idf matrix
-         dist = 1 - cosine_similarity(tfidf_matrix)
+         dist = 1 - cosine_similarity(tfidf)
        print('DEBUG Computed distance matrix.')

        start_time = time.time()
        # Generate HAC model.
-         hac_model = AgglomerativeClustering(linkage='ward', n_clusters=n_clusters)
+         hac_model = AgglomerativeClustering(
+             linkage='ward', n_clusters=n_clusters)
        # Fit the model on the distance matrix.
        hac_model.fit(dist)
        end_time = time.time()
@@ -257,15 +289,18 @@ def hac(self, tfidf_path=None, verbose=False):
        if verbose:
            # Visualize cluster model
            children = hac_model.children_
-             merges = [{'node_id': node_id + len(dist),
-                        'right': children[node_id, 0], 'left': children[node_id, 1]
-                        } for node_id in range(0, len(children))]
+             merges = [{
+                 'node_id': node_id + len(dist),
+                 'right': children[node_id, 0],
+                 'left': children[node_id, 1]
+             } for node_id in range(0, len(children))]
            pickle.dump(merges, open('merges.pkl', 'wb'))
            pickle.dump(children, open('children.pkl', 'wb'))

            for merge_entry in enumerate(merges):
                print(merge_entry[1])

-         print('Clustering completed after ' + str(round((end_time - start_time)/60)) + "' "
-               + str(round((end_time - start_time)%60)) + "''")
+         print('Clustering completed after ' +
+               str(round((end_time - start_time) / 60)) + "' " +
+               str(round((end_time - start_time) % 60)) + "''")
        return hac_model
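
For reference, a minimal usage sketch of the refactored interface (not part of the commit). It assumes a Corpus object exposing document_generator(), as used by extract_tfidf above; the module names, the Corpus constructor, and the chosen cluster/dimension counts are hypothetical and shown for illustration only.

# Usage sketch -- module names and Corpus constructor are assumptions.
from corpus import Corpus               # hypothetical module providing Corpus
from clustering import ClusterMaker     # hypothetical module for the code above

corpus = Corpus('/path/to/documents')   # illustrative constructor
maker = ClusterMaker(corpus)

# Compute the Tf/Idf matrix once and reuse it for both clustering methods.
tfidf = maker.extract_tfidf()

# MiniBatchKMeans on an LSA-reduced space (100 dimensions, 20 clusters).
kmodel = maker.kmeans(20, tfidf=tfidf, n_dimensions=100)

# Hierarchical agglomerative clustering on the same matrix.
hac_model = maker.hac(20, tfidf=tfidf, n_dimensions=100)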