-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalyze_glassdoor_job_data_anqi.py
276 lines (226 loc) · 12.7 KB
/
analyze_glassdoor_job_data_anqi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
import os
import pandas as pd
import numpy as np
import time
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
# %matplotlib inline
plt.style.use('seaborn-whitegrid')
plt.rcParams['savefig.facecolor']='white'
params = {'figure.figsize': (18,12),
'axes.titlesize': 20}
plt.rcParams.update(params)
def check_dir(directory):
if not os.path.exists(directory):
os.mkdir(directory)
# bar chart for distribution
def plot_pie_chart(df, column, save = False):
temp = df[column].value_counts()
temp = pd.DataFrame({'labels': temp.index,
'values': temp.values
})
values = temp['values']
labels = temp['labels']
fig = plt.figure(figsize=(12, 12), facecolor='w')
bbox_props = dict(boxstyle="round", fc="w", ec="0.5", alpha=0.6)
patches = plt.pie(values, labels=labels, autopct='%1.1f%%', startangle=90, pctdistance = 0.8,
textprops={'fontsize': 20, 'bbox': bbox_props})
plt.axis('equal')
title = 'Distribution of ' + column
plt.title(title, loc = 'center', y=1.08, fontsize = 25)
plt.tight_layout()
if save:
saved_path = os.path.join(plot_dir, title).replace(' ', '-')
fig.savefig(saved_path, dpi=200, bbox_inches="tight")
else:
plt.show()
plt.close()
def plot_box(df, column, save = False):
temp = df[column].value_counts()
temp = pd.DataFrame({'labels': temp.index,
'values': temp.values
})
values = temp['values']
labels = temp['labels']
fig = plt.figure(figsize=(12, 12), facecolor='w')
bbox_props = dict(boxstyle="round", fc="w", ec="0.5", alpha=0.6)
patches = plt.pie(values, labels=labels, autopct='%1.1f%%', startangle=90, pctdistance = 0.8,
textprops={'fontsize': 20, 'bbox': bbox_props})
plt.axis('equal')
title = 'Distribution of ' + column
plt.title(title, loc = 'center', y=1.08, fontsize = 25)
plt.tight_layout()
if save:
saved_path = os.path.join(plot_dir, title).replace(' ', '-')
fig.savefig(saved_path, dpi=200, bbox_inches="tight")
else:
plt.show()
plt.close()
#
# # bubble plot
# def plot_bubble_chart(df, column, save = False):
#
# selected_categories = df.groupby([df.columns[0]])['Total Search'].sum().reset_index().sort_values('Total Search').tail(12)[df.columns[0]]
# df = df[df[df.columns[0]].isin(selected_categories)]
#
# fig = sns.relplot(x="Total Search", y=column, size="Total Search", hue="Total Search",
# sizes=(1000, 10000), data=df, height=8, aspect=12/8, legend = False)
# bbox_props = dict(boxstyle="round", fc="w", ec="0.5", alpha=0.6)
# for line in range(0,df.shape[0]):
# plt.text(df["Total Search"].iloc[line], df[column].iloc[line], df[df.columns[0]].iloc[line],
# bbox=bbox_props, horizontalalignment='left', size='large', color='black', fontsize = 20)
# title = column + ' vc Total Search by ' + category
# plt.xlim(0, df['Total Search'].max()*1.1)
# plt.ylim(0, df[column].max()*1.1)
# x = np.linspace(0, df['Total Search'].max()*1.1, df['Total Search'].max()*1.1)
# y = [df[column].mean()] * len(x)
# plt.plot(x, y, linewidth = 2, color = 'red')
# plt.title(title, loc = 'center', y=1.08, fontsize = 25)
#
# if save:
# saved_path = os.path.join(plot_dir, title).replace(' ', '-')
# fig.savefig(saved_path, dpi=200, bbox_inches="tight")
# else:
# plt.show()
#
# plt.close()
data_path = 'data/Glassdoor_Jobs_Data.csv'
plot_dir = 'plot'
check_dir(plot_dir)
jobs_df = pd.read_csv(data_path, encoding = "ISO-8859-1")
print(jobs_df.shape)
print(jobs_df.isnull().sum())
print(jobs_df.nunique())
print(jobs_df.describe())
jobs_df.head()
# Data Cleaning ================================================================
jobs_df = jobs_df.rename(columns = {'Comapny Description': 'Company Description'})
# Industry
# Two companies categorize itself under 2 industries
df = jobs_df.drop_duplicates(['Company', 'Industry']).groupby(['Company'])['Industry'].count().reset_index()
companies_more_than_1_industry = list(df[df['Industry'] > 1]['Company'])
companies_more_than_1_industry
jobs_df[jobs_df['Company'] == companies_more_than_1_industry[0]]['Industry'].value_counts()
jobs_df[jobs_df['Industry'] == 'Lending']['Company'].unique()
citibank_industry = jobs_df[jobs_df['Company'] == companies_more_than_1_industry[0]]['Industry'].value_counts().index[1]
jobs_df['Industry'] = jobs_df['Industry'].replace('Lending', citibank_industry)
jobs_df['Company'] = jobs_df['Company'].replace('Citibank NA', 'Citibank')
jobs_df[jobs_df['Company'] == 'Citibank NA'].shape
jobs_df[jobs_df['Company'] == companies_more_than_1_industry[1]]['Industry'].value_counts()
jobs_df[jobs_df['Industry'] == 'Wholesale']['Company'].unique()
jobs_df[jobs_df['Industry'] == 'Biotech & Pharmaceuticals']['Company'].unique()
jobs_df['Industry'] = jobs_df['Industry'].replace('Wholesale', 'Biotech & Pharmaceuticals')
jobs_df['Industry'] = jobs_df['Industry'].fillna('NA')
# Company: Take out NAs
jobs_df = jobs_df[~pd.isna(jobs_df['Company'])]
# Company Description: Concatenate description for company that has more than one company description
df = jobs_df.drop_duplicates(['Company', 'Company Description']).groupby(['Company'])['Company Description'].count().reset_index()
companies_more_than_1_desc = list(df[df['Company Description'] > 1]['Company'])
companies_more_than_1_desc
desc_concat = ' '.join(jobs_df[jobs_df['Company'] == companies_more_than_1_desc[0]]['Company Description'].value_counts().index)
jobs_df[jobs_df['Company'] == companies_more_than_1_desc[0]]
jobs_df.loc[jobs_df['Company'] == companies_more_than_1_desc[0], 'Company Description'] = desc_concat
jobs_df['Company Description'] = jobs_df['Company Description'].fillna('NA')
# Job Description: NAs
jobs_df['Job Description'] = jobs_df['Job Description'].fillna('NA')
# Head Quarter
df = jobs_df.drop_duplicates(['Company', 'Head Quarter']).groupby(['Company'])['Head Quarter'].count().reset_index()
companies_more_than_1_hq = list(df[df['Head Quarter'] > 1]['Company'])
companies_more_than_1_hq
for company in companies_more_than_1_hq:
hq = jobs_df[jobs_df['Company'] == company]['Head Quarter'].value_counts().index[0]
jobs_df.loc[jobs_df['Company'] == company, 'Head Quarter'] = hq
jobs_df['Head Quarter'] = jobs_df['Head Quarter'].fillna('NA')
# Company Size: NAs
df = jobs_df.drop_duplicates(['Company', 'Company Size']).groupby(['Company'])['Company Size'].count().reset_index()
companies_more_than_1_size = list(df[df['Company Size'] > 1]['Company'])
for company in companies_more_than_1_size:
size = jobs_df[jobs_df['Company'] == company]['Company Size'].value_counts().index[0]
jobs_df.loc[jobs_df['Company'] == company, 'Company Size'] = size
jobs_df = jobs_df[~pd.isna(jobs_df['Company Size'])]
sizes = list(jobs_df['Company Size'].value_counts().index)
sizes_map = {size: int(sum([int(size.split()[0]), int(size.split()[2])]) / 2) for size in sizes[1:]}
sizes_map[sizes[0]] = 25000
jobs_df['Company Size (Num)'] = jobs_df['Company Size'].map(sizes_map)
# Company Revenue
revenues = list(jobs_df['Company Revenue'].value_counts().index)
revenue_map = {revenue: int(sum([int(revenue.split()[0][1:]), int(revenue.split()[2][1:])]) / 2) * 10**(6 if revenue.split()[-2] == 'million' else 9) for revenue in revenues[1:-1]}
revenue_map[revenues[0]] = 25 * 10**9
revenue_map[revenues[-1]] = 0.5 * 10**6
jobs_df['Company Revenue (Num)'] = jobs_df['Company Revenue'].map(revenue_map)
# Overall Rating
df = jobs_df.drop_duplicates(['Company', 'Overall Rating']).groupby(['Company'])['Overall Rating'].count().reset_index()
companies_more_than_1_rating = list(df[df['Overall Rating'] > 1]['Company'])
companies_more_than_1_rating
for company in companies_more_than_1_rating:
rating = jobs_df[jobs_df['Company'] == company]['Overall Rating'].value_counts().index[0]
jobs_df.loc[jobs_df['Company'] == company, 'Overall Rating'] = rating
company_no_rating = list(set(jobs_df[pd.isna(jobs_df['Overall Rating'])]['Company']))
for company in company_no_rating:
ratings = list(jobs_df[jobs_df['Company'] == company]['Overall Rating'].value_counts().index)
if len(ratings) != 0:
jobs_df.loc[jobs_df['Company'] == company, 'Overall Rating'] = ratings[0]
rating_mode = jobs_df['Overall Rating'].mode()[0]
jobs_df['Overall Rating'].fillna(rating_mode, inplace = True)
sns.distplot(jobs_df['Overall Rating'])
jobs_df.loc[jobs_df['Overall Rating'] == -1, 'Overall Rating'] = rating_mode
# Founded Year
sns.distplot(jobs_df['Founded Year'])
df = jobs_df.drop_duplicates(['Company', 'Founded Year']).groupby(['Company'])['Founded Year'].count().reset_index()
companies_more_than_1_year = list(df[df['Founded Year'] > 1]['Company'])
for company in companies_more_than_1_year:
years = list(jobs_df[jobs_df['Company'] == company]['Founded Year'].value_counts().index)
if 0 in years:
years.remove(0)
jobs_df.loc[jobs_df['Company'] == company, 'Founded Year'] = years[0]
jobs_df['Founded Year'].replace(0, np.nan, inplace = True)
year_mode = jobs_df['Founded Year'].mode()[0]
jobs_df['Founded Year'].fillna(year_mode, inplace = True)
# Data Plotting ================================================================
# Univariate
plot_pie_chart(jobs_df, 'Company Size')
plot_pie_chart(jobs_df, 'Company Size', save = True)
plot_pie_chart(jobs_df, 'Company Revenue')
plot_pie_chart(jobs_df, 'Company Revenue', save = True)
sns.distplot(jobs_df['Overall Rating'])
sns.distplot(jobs_df['Founded Year'])
sns.catplot(x="Company Size", kind="count", data=jobs_df, order=list(jobs_df[~pd.isna(jobs_df['Company Size (Num)'])].sort_values('Company Size (Num)')['Company Size'].unique()), height=8, aspect=12/8)
sns.catplot(x="Company Revenue", kind="count", data=jobs_df, order=list(jobs_df[~pd.isna(jobs_df['Company Revenue (Num)'])].sort_values('Company Revenue (Num)')['Company Revenue'].unique()), height=12, aspect=12/8)
sns.catplot(x="Company Revenue", kind="count", data=jobs_df, order=list(jobs_df.sort_values('Company Revenue (Num)')['Company Revenue'].unique()), height=8, aspect=12/8)
jobs_df.columns
Index(['Job Title', 'Company', 'Salary 50th Percentile',
'Salary 10th Percentile', 'Salary 90th Percentile', 'Overall Rating',
'Company Description', 'Founded Year', 'Head Quarter', 'Company Size',
'Industry', 'Company Revenue', 'Job Description', 'Company Size (Num)',
'Company Revenue (Num)'],
dtype='object')
# Bivariate
sns.catplot(x="Company Size", y="Company Revenue (Num)", order=list(jobs_df.sort_values('Company Size (Num)')['Company Size'].unique()), kind="box", data=jobs_df, height=8, aspect=12/8)
sns.catplot(x="Company Size", y="Salary 50th Percentile", order=list(jobs_df.sort_values('Company Size (Num)')['Company Size'].unique()), kind="box", data=jobs_df, height=8, aspect=12/8)
sns.catplot(x="Company Size", y="Salary 10th Percentile", order=list(jobs_df.sort_values('Company Size (Num)')['Company Size'].unique()), kind="box", data=jobs_df, height=8, aspect=12/8)
sns.catplot(x="Company Size", y="Salary 90th Percentile", order=list(jobs_df.sort_values('Company Size (Num)')['Company Size'].unique()), kind="box", data=jobs_df, height=8, aspect=12/8)
df = pd.concat([jobs_df[['Company Size', 'Salary 10th Percentile']].rename(columns = {'Salary 10th Percentile': 'Salary'}), jobs_df[['Company Size', 'Salary 50th Percentile']].rename(columns = {'Salary 50th Percentile': 'Salary'}), jobs_df[['Company Size', 'Salary 90th Percentile']].rename(columns = {'Salary 90th Percentile': 'Salary'})])
df['Salary Type'] = ['Salary 10th Percentile'] * jobs_df.shape[0] + ['Salary 50th Percentile'] * jobs_df.shape[0] + ['Salary 90th Percentile'] * jobs_df.shape[0]
fig = sns.catplot(x="Company Size", y="Salary", hue = 'Salary Type', order=list(jobs_df.sort_values('Company Size (Num)')['Company Size'].unique()), kind="box", data=df, height=8, aspect=12/8)
title = "Salary Range for Company with Different Size"
plt.title(title, loc = 'center', y=1.08, fontsize = 25)
saved_path = os.path.join(plot_dir, title).replace(' ', '-')
fig.savefig(saved_path, dpi=200, bbox_inches="tight")
# Plot Word Cloud
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
text = " ".join(review for review in jobs_df['Job Description'])
print ("There are {} words in the combination of all review.".format(len(text)))
# Create stopword list:
stopwords = set(STOPWORDS)
# stopwords.update([])
wordcloud = WordCloud(width=1600, height=800, stopwords=stopwords, background_color="white").generate(text)
plt.figure( figsize=(20,10), facecolor='w')
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
# plt.show()
plt.savefig('wordcloud.png', bbox_inches='tight')