'''This module processes the data so that it is ready to be input into the NN'''
import logging
import pandas as pd
import numpy as np

def read_data(f_l):
    '''This function takes in the location of the data and opens it,
    labeling all the columns. It also checks that there is no missing data
    and that all data is numeric, as required for a NN'''
    headers = ['age', 'sex', 'chest_pain_type', 'resting_blood_pressure',
               'serum_cholestoral', 'fasting_blood_sugar', 'resting_ecg_results',
               'max_heart_rate_achieved', 'exercise_induced_angina', 'oldpeak',
               'slope_of_peak', 'num_of_major_vessels', 'thal', 'heart_disease']
    df = pd.read_csv(f_l, sep=' ', names=headers)
    logging.info('read file')
    # convert the 1/2 target labels to the conventional 0/1 encoding
    df['heart_disease'] = df['heart_disease'].replace({1: 0, 2: 1})
    # check whether there are any null values in the data
    if df.isnull().sum().any():
        logging.warning('Missing some values in data')
    # check whether all data is numeric
    if not df.shape[1] == df.select_dtypes(include=np.number).shape[1]:
        logging.warning('Some data is not numeric')
    return df

def train_test_split(df, test_size=0.2, shuffle=True):
    '''This takes in a data set and returns it split into training
    and testing data, as well as split into features and target.
    test_size - the fraction of the data to be used for testing
    shuffle - whether or not to shuffle the data before splitting'''
    if test_size > 1 or test_size < 0:
        logging.error('Test size is not valid')
        raise ValueError('test_size must be between 0 and 1')
    if shuffle:
        df = df.sample(frac=1).reset_index(drop=True)
    num_row_test = round(len(df)*test_size)
    test_data = df.iloc[:num_row_test, :].reset_index(drop=True)
    train_data = df.iloc[num_row_test:, :].reset_index(drop=True)
    x_test, y_test = split_data(test_data)
    x_train, y_train = split_data(train_data)
    logging.info('split data into training and testing, and feature and target')
    return x_test, y_test, x_train, y_train
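
# Usage sketch (not part of the pipeline itself): the return order places the
# test split before the training split, so unpack accordingly. `df` here stands
# for the DataFrame returned by read_data().
#   x_test, y_test, x_train, y_train = train_test_split(df, test_size=0.2)
#   assert len(x_test) + len(x_train) == len(df)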

def split_data(df):
    '''This takes in a data frame and splits the target from the features,
    returning the two separately'''
    x_data = df.drop(columns=['heart_disease'])
    y_data = df['heart_disease']
    return x_data, y_data

class Standardizer:
    '''This class standardizes the data (zero mean, unit variance per column)'''
    mean = None
    std = None
    def fit(self, df):
        '''this method finds the mean and std of each column given'''
        self.mean = df.mean()
        self.std = df.std()
    def transform(self, df):
        '''This standardizes the input data using the previously fitted
        mean and std'''
        norm_df = (df - self.mean) / self.std
        return norm_df
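
# Usage sketch (variable names mirror process_data below): fit on the training
# features only, then reuse the same statistics for both splits, so no
# information from the test set leaks into the scaling.
#   stdzr = Standardizer()
#   stdzr.fit(x_train)
#   x_train = stdzr.transform(x_train)
#   x_test = stdzr.transform(x_test)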

def process_data(file_location):
    '''Brings all the methods together to return the standardized and
    separated data from the file location specified'''
    df = read_data(file_location)
    x_test, y_test, x_train, y_train = train_test_split(df)
    stdzr = Standardizer()
    stdzr.fit(x_train)
    logging.info('computed mean and std of training features')
    x_train = stdzr.transform(x_train)
    x_test = stdzr.transform(x_test)
    logging.info('transformed features of the training and testing sets')
    logging.info('processed data')
    return x_test.to_numpy(), y_test.to_numpy(), x_train.to_numpy(), y_train.to_numpy()
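
# Minimal usage sketch, assuming a space-separated heart-disease data file is
# available locally as 'heart.dat' (the path is an assumption, not something
# defined by this module).
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    X_TEST, Y_TEST, X_TRAIN, Y_TRAIN = process_data('heart.dat')
    logging.info('training features shape: %s', X_TRAIN.shape)
    logging.info('testing features shape: %s', X_TEST.shape)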