-
Notifications
You must be signed in to change notification settings - Fork 0
/
tfidf_means.py
117 lines (93 loc) · 4.03 KB
/
tfidf_means.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from evolve_main import prep_data, process_text
'''
Analyze neighborhood overview text by neighborhood by creating a figure that displays the top 20 words by average tf-idf score to understand the presence of common terms/features and derive key characteristics of each neighborhood. The below code was developed with reference to Thomas Buhrmann's blog post, Analyzing tf-idf results in scikit-learn (https://buhrmann.github.io/tfidf-analysis.html)
'''
def tf_idf(listings):
    '''
    Build a tf-idf matrix from the neighborhood overview text of listings in five selected neighborhoods.

    listings: DataFrame with 'neighbourhood_cleansed' and 'neighborhood_overview' columns.

    Returns a tuple (X_trans, nhoods, features):
        X_trans  -- tf-idf matrix produced by TfidfVectorizer, one row per kept listing
        nhoods   -- array of neighborhood names, aligned with the rows of X_trans
        features -- list of vocabulary terms, aligned with the columns of X_trans
    '''
    hoods = ['Jamaica Plain', 'South End', 'Back Bay', 'Fenway', 'Dorchester']
    # Keep only listings in the selected neighborhoods that actually have overview text
    in_hoods = listings['neighbourhood_cleansed'].isin(hoods)
    has_overview = listings['neighborhood_overview'].notnull()
    df_hoods = listings[in_hoods & has_overview]
    nhoods = df_hoods['neighbourhood_cleansed'].values
    # process_text comes from evolve_main; presumably it returns a cleaned string per overview -- TODO confirm
    new_X = [process_text(overview) for overview in df_hoods['neighborhood_overview'].values]
    tfidf = TfidfVectorizer(max_features=2000, stop_words='english')
    X_trans = tfidf.fit_transform(new_X)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement and
    # fall back only on old releases that do not have it yet
    if hasattr(tfidf, 'get_feature_names_out'):
        features = tfidf.get_feature_names_out().tolist()
    else:
        features = tfidf.get_feature_names()
    return X_trans, nhoods, features
def top_tfidf_feats(row, features, top_n=20):
    '''
    Get the top n tfidf values in a row and return them with their corresponding feature names

    row: 1-D sequence of scores, where row[i] is the score of features[i]
    features: sequence of feature names, indexable by position
    top_n: maximum number of features to return

    Returns a DataFrame with columns 'feature' and 'tfidf', ordered from highest to lowest score. When there is nothing to return (empty row or top_n of 0) the result is an empty DataFrame that still has both columns.
    '''
    # argsort is ascending, so reverse it before taking the first top_n positions
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Naming the columns in the constructor also works when top_feats is empty,
    # where assigning df.columns afterwards would raise a length-mismatch error
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])
def top_feats_in_doc(X, features, row_id, top_n=20):
    '''
    Look up a single document in the tf-idf matrix and report its top n features.

    X: tf-idf matrix exposing toarray() (presumably the sparse matrix returned by tf_idf -- verify against caller)
    features: feature names aligned with the columns of X
    row_id: index of the document (row) to inspect
    top_n: number of features to return
    '''
    dense_doc = X[row_id].toarray()
    # squeeze drops any length-1 axes so the scores form a flat vector
    doc_scores = np.squeeze(dense_doc)
    return top_tfidf_feats(doc_scores, features, top_n)
def top_mean_feats(X, features, grp_ids=None, min_tfidf=0.1, top_n=20):
    '''
    Return the top n features that on average are most important among documents in rows identified by indices in grp_ids

    X: tf-idf matrix exposing toarray()
    features: feature names aligned with the columns of X
    grp_ids: row indices to average over (e.g. the result of np.where); None means use every row
    min_tfidf: scores below this threshold are zeroed before averaging
    top_n: number of features to return
    '''
    # Compare against None explicitly: truth-testing a NumPy index array raises
    # ValueError, and an empty selection must not silently mean "all documents"
    if grp_ids is None:
        X_new = X.toarray()
    else:
        X_new = X[grp_ids].toarray()
    # toarray() produced a fresh dense array, so zeroing in place leaves X untouched
    X_new[X_new < min_tfidf] = 0
    tfidf_means = np.mean(X_new, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)
def top_feats_by_nhood(X, nhoods, features, min_tfidf=0.1, top_n=20):
    '''
    Summarize the tf-idf matrix one neighborhood at a time.

    For every distinct value in nhoods, the rows of X carrying that value are averaged via top_mean_feats and the resulting top_n dataframe is collected. Each dataframe gets a `label` attribute holding its neighborhood name. The list is ordered as np.unique orders the names.
    '''
    per_hood = []
    for hood in np.unique(nhoods):
        # positions of the documents that belong to this neighborhood
        member_rows = np.where(nhoods == hood)
        summary = top_mean_feats(X, features, member_rows, min_tfidf=min_tfidf, top_n=top_n)
        # plain attribute (not a column); plot_tfidf_dfs reads it for the subplot title
        summary.label = hood
        per_hood.append(summary)
    return per_hood
def plot_tfidf_dfs(dfs, out_path='img/tfidf_means.png'):
    '''
    Plot the dataframes returned by the function top_feats_by_nhood()

    dfs: non-empty list of dataframes, each with 'feature' and 'tfidf' columns and a `label` attribute used as the subplot title
    out_path: file the figure is written to; its parent directory is created if it does not exist yet

    Draws one horizontal bar chart per dataframe, side by side, and saves the figure at 400 dpi.

    Raises ValueError if dfs is empty.
    '''
    import os

    if not dfs:
        raise ValueError('plot_tfidf_dfs() needs at least one dataframe to plot')

    fig = plt.figure(figsize=(12, 8), facecolor="w")
    for i, df in enumerate(dfs):
        # Bar positions are derived per dataframe so panels of different lengths still plot
        x = np.arange(len(df))
        ax = fig.add_subplot(1, len(dfs), i+1)
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.set_frame_on(False)
        ax.get_xaxis().tick_bottom()
        ax.get_yaxis().tick_left()
        ax.set_title(str(df.label), fontsize=14)
        ax.ticklabel_format(axis='x', style='sci', scilimits=(-2,2))
        ax.barh(x, df.tfidf, align='center', color='blue', alpha=0.5)
        ax.set_yticks(x)
        # len(df) equals the last bar position + 1, and is also valid for an empty dataframe
        ax.set_ylim([-1, len(df)])
        ax.set_yticklabels(df.feature)
    plt.subplots_adjust(bottom=0.09, right=0.97, left=0.15, top=0.9, wspace=0.52)
    # Report the real number of words shown instead of always claiming 20
    plt.suptitle('Mean Tf-Idf Score for Top {} Words by Neighborhood Overview'.format(len(dfs[0])), fontsize=16)
    # plt.show()
    out_dir = os.path.dirname(out_path)
    if out_dir:
        # savefig does not create missing directories
        os.makedirs(out_dir, exist_ok=True)
    plt.savefig(out_path, dpi=400)
    plt.close()
if __name__ == '__main__':
    # Script entry point: load the data, vectorize the overviews, then plot
    # the per-neighborhood top features. The calendar frame is not used here.
    listings, _calendar = prep_data()
    # tf_idf returns (X, nhoods, features), matching top_feats_by_nhood's positional order
    plot_tfidf_dfs(top_feats_by_nhood(*tf_idf(listings)))