data_preprocessor.py
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

import config


class DataPreprocessor:
    @staticmethod
    def check_missing_values(data):
        """Return the number of missing values per column."""
        return data.isnull().sum()

    @staticmethod
    def drop_missing_values(data):
        """Drop rows with missing values."""
        return data.dropna()

    @staticmethod
    def outlier_thresholds(dataframe, variable):
        """Determine the lower and upper outlier thresholds for a variable using the IQR rule."""
        quartile1 = dataframe[variable].quantile(config.Config.OUTLIERS_LOWER_LIMIT)
        quartile3 = dataframe[variable].quantile(config.Config.OUTLIERS_UPPER_LIMIT)
        interquantile_range = quartile3 - quartile1
        up_limit = quartile3 + 1.5 * interquantile_range
        low_limit = quartile1 - 1.5 * interquantile_range
        return low_limit, up_limit

    @staticmethod
    def check_outlier(dataframe, col_name):
        """Return True if the given column contains any outliers."""
        low_limit, up_limit = DataPreprocessor.outlier_thresholds(dataframe, col_name)
        return dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None)

    @staticmethod
    def replace_with_thresholds(dataframe, col_name):
        """Cap outliers at the lower and upper thresholds (modifies the dataframe in place)."""
        low_limit, up_limit = DataPreprocessor.outlier_thresholds(dataframe, col_name)
        dataframe.loc[(dataframe[col_name] < low_limit), col_name] = low_limit
        dataframe.loc[(dataframe[col_name] > up_limit), col_name] = up_limit

    @staticmethod
    def preprocess_data(data):
        """Perform data preprocessing including outlier handling, feature engineering, encoding, and scaling."""
        # Separate categorical and numerical columns
        categorical_data = data.select_dtypes(include=['object'])
        numerical_data = data.select_dtypes(exclude=['object'])

        # Cap outliers in every numerical column that contains them
        for col in numerical_data.columns:
            if DataPreprocessor.check_outlier(data, col):
                DataPreprocessor.replace_with_thresholds(data, col)
        # Feature engineering: ratios, products, and per-year differences
        data['NEW_Hits'] = data['Hits'] / data['CHits'] + data['Hits']
        data['NEW_RBI'] = data['RBI'] / data['CRBI']
        data['NEW_Walks'] = data['Walks'] / data['CWalks']
        data['NEW_PutOuts'] = data['PutOuts'] * data['Years']
        data['Hits_Success'] = (data['Hits'] / data['AtBat']) * 100
        data['NEW_CRBI*CATBAT'] = data['CRBI'] * data['CAtBat']
        data['NEW_Chits'] = data['CHits'] * data['Years']
        data['NEW_CHmRun'] = data['CHmRun'] * data['Years']
        data['NEW_CRuns'] = data['CRuns'] / data['Years']
        data['NEW_RW'] = data['RBI'] * data['Walks']
        data['NEW_RBWALK'] = data['RBI'] / data['Walks']
        data['NEW_CH_CB'] = data['CHits'] / data['CAtBat']
        data['NEW_CHm_CAT'] = data['CHmRun'] / data['CAtBat']
        data['NEW_Diff_Atbat'] = data['AtBat'] - (data['CAtBat'] / data['Years'])
        data['NEW_Diff_Hits'] = data['Hits'] - (data['CHits'] / data['Years'])
        data['NEW_Diff_HmRun'] = data['HmRun'] - (data['CHmRun'] / data['Years'])
        data['NEW_Diff_Runs'] = data['Runs'] - (data['CRuns'] / data['Years'])
        data['NEW_Diff_RBI'] = data['RBI'] - (data['CRBI'] / data['Years'])
        data['NEW_Diff_Walks'] = data['Walks'] - (data['CWalks'] / data['Years'])
        # One-hot encode the categorical columns
        data = pd.get_dummies(data, columns=categorical_data.columns, drop_first=True)

        # Scale every non-object column (this also covers the dummy columns created above)
        scaler = StandardScaler()
        numerical_data = data.select_dtypes(exclude=['object'])
        data[numerical_data.columns] = scaler.fit_transform(data[numerical_data.columns])

        return data
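

# A minimal usage sketch (not part of the original module). It assumes an input
# CSV named "hitters.csv" containing the standard Hitters columns referenced
# above (Hits, CHits, RBI, CRBI, Walks, CWalks, AtBat, CAtBat, HmRun, CHmRun,
# Runs, CRuns, PutOuts, Years, ...) and a config.Config exposing
# OUTLIERS_LOWER_LIMIT and OUTLIERS_UPPER_LIMIT (e.g. 0.25 and 0.75). The file
# name and quantile values are assumptions, not confirmed by this module.
if __name__ == "__main__":
    df = pd.read_csv("hitters.csv")                      # hypothetical input file
    print(DataPreprocessor.check_missing_values(df))     # inspect missing values per column
    df = DataPreprocessor.drop_missing_values(df)        # drop incomplete rows
    processed = DataPreprocessor.preprocess_data(df)     # cap outliers, engineer features, encode, scale
    print(processed.head())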