-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_clean1.py
64 lines (45 loc) · 1.46 KB
/
data_clean1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Read the raw dataset; the CSV is looked up relative to the working directory.
df = pd.read_csv('who_suicide_statistics.csv')

# Number of missing values per column.
print(df.isnull().sum())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only added a stray "None" line.
df.info()

# Create the (large) current figure; later plt.* calls draw onto it.
plt.figure(figsize=(25 , 25))
# Optional: visualise where the missing values are.
# sns.heatmap(df.isnull())
# plt.show()

# Percentage of missing values per column, computed once and reused below.
null_var = df.isnull().sum() / df.shape[0] * 100
print(null_var)

# Columns whose share of missing values exceeds this percentage are dropped.
MAX_NULL_PERCENT = 2
drop_col = null_var[null_var > MAX_NULL_PERCENT].keys()
print(drop_col)

# New frame without the sparse columns; df itself is left unchanged.
data = df.drop(columns=drop_col)
print(data)
# Optional: confirm visually that the remaining columns have few/no gaps.
# sns.heatmap(data.isnull())
# plt.show()
# Drop every row that contains at least one missing value. This starts from
# the original df (not from data), so no column is removed here.
data2 = df.dropna()
print(data2.isnull().sum())
# Optional: quick before/after look at a single column.
# sns.distplot(df['population'])
# sns.distplot(data2['population'])
# plt.show()

# Names of the int64/float64 columns of the cleaned frame (kept for inspection).
df3 = data2.select_dtypes(include=['int64','float64']).columns
# print(df3)

# For each numeric column, overlay its distribution before (df) and after
# (data2) the row drop in its own subplot, to see whether dropping rows
# changed the shape of the distribution.
num_val = ['year', 'suicides_no', 'population']
for i, var in enumerate(num_val):
    # Subplot indices are 1-based, hence i + 1.
    plt.subplot(9, 4, i + 1)
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases
    # (histplot/displot are the documented replacements); it is kept here so
    # the script does not require a newer seaborn than it was written for.
    sns.distplot(df[var], bins=20)
    sns.distplot(data2[var], bins=20)
# plt.show()
# Names of the object-dtype columns of the cleaned frame (kept for inspection).
df4 = data2.select_dtypes(include=['object']).columns
# print(df4)
# print(data2.shape[0])

# Share of rows per country, in percent, before and after dropping rows.
# Each share is divided by the row count of its OWN frame: the original counts
# were previously divided by the cleaned row count, which inflated the
# "original" percentages and made the two columns incomparable.
df5 = df['country'].value_counts() / df.shape[0] * 100
country_clean = data2['country'].value_counts() / data2.shape[0] * 100
print(country_clean)
print(df5)

# Side-by-side comparison, aligned on country name.
print(pd.concat([df5, country_clean], axis=1, keys=['country_org', 'country_clean']))