-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfeature_sc.py
56 lines (32 loc) · 1.15 KB
/
feature_sc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#types
'''
Min Max Scaler - normalization
Standard Scaler - standardization
Max Abs Scaler
'''
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler , MinMaxScaler
from sklearn.model_selection import train_test_split
df = pd.read_csv("who_suicide_statistics.csv")
print(df.head())
df = pd.get_dummies(df , drop_first=True)
print(df.keys())
df2 = df[[ 'population' , 'sex_male' , 'suicides_no']]
df2 = df2.fillna(df2.mean())
#convert into matrix
X = df2.drop("suicides_no" , axis =1)
y = df2["suicides_no"]
print(X.shape , y.shape)
x_train , x_test , y_train , y_test = train_test_split(X , y , test_size = 0.2 , random_state=0)
s_scaler = StandardScaler()
sc = s_scaler.fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)
# print(x_train)
x_train_df = pd.DataFrame(x_train , columns=['population' , 'sex_male'] )
x_test_df = pd.DataFrame(x_test , columns=['population' , 'sex_male'])
print(x_train_df.describe().round())
#now mean and std = 0 and 1
#same for min_max , for this min = 0 and max = 1 (become) x_train_df.describe().round()
#sns.pairplot(x_train) - no change in data distribution