Commit 596ddfa

cleaned up main file
1 parent 5035ecc commit 596ddfa

1 file changed: +69 additions, -65 deletions

main.py
@@ -12,86 +12,90 @@
 
 2) Data preparation -- DataPrep
     - Perform Exploratory Data Analysis (EDA)
-
     - Deal with null values based on the results of EDA
     - Encode categorical data
     - Scale numerical features only -- skip encoded categorical columns
     - Split the data set into training and testing set
 
-5) Model selection and training
+3) Model selection and training
     - Models for this project:
         - Linear Regression
-            - Use regularization to avoid under/overfitting
         - Multi-Class Logistic Regression
         - K-Nearest Neighbors (KNN)
-    - Choose a gradient descent variant and use it for parameter optimization
 
-6) Hyperparameter tuning
-    - Use grid search / random search for tuning
-    - Evaluate models on validation dataset to prevent overfitting
-
-6) Model Evaluation
-    - Evaluating the best model and the hyperparameters which give the best accuracy
+4) Model Evaluation
+    - Evaluating the best model accuracy percentages
 """
 
-# numerical data .skew() outputs
-# CustomerID: 0.0
-# Age: -0.040893617755290594
-# Tenure: -0.12605627128660457
-# Usage Frequency: 0.03754298828827117
-# Support Calls: -0.19285414431875514
-# Payment Delay: -0.35071402695836457
-# Total Spend: 0.04774634961486376
-# Last Interaction: 0.005111808910520158
-# Churn: 0.10540842751365004
 
-data = pd.read_csv("customer_churn_dataset-testing-master.csv")
-data = data.drop("CustomerID", axis=1) # dropping unnecessary columns
+def prepare_data(file_path):
+    """
+    load and prepare the dataset
+    """
+    data = pd.read_csv(file_path)
+    data = data.drop("CustomerID", axis=1) # dropping unnecessary columns
 
-"""
-Data Preparation
-"""
-dp = DataPrep(data)
+    dp = DataPrep(data)
+    dp.clean_data()
+    dp.transform()
 
-dp.clean_data()
-dp.transform()
+    return dp.get_datasets()
 
-x_train, y_train, x_test, y_test = dp.get_datasets()
 
+def evaluate_linear(x_train, y_train, x_test, y_test):
+    linear_model = models.LinearRegression()
 
-"""
-Training the models
-"""
-# Linear Regression Model
-linear_model = models.LinearRegression()
-
-linear_losses = linear_model.fit(x_train, y_train)
-continuous_predictions = linear_model.predict(x_test)
-linear_predictions = linear_model.predict_class(x_test)
-print(f"Accuracy of Linear Regression: {models.accuracy(linear_predictions, y_test): .2f}%")
-
-# Logistic Regression Model
-log_model = models.LogisticRegression()
-
-log_losses = log_model.fit(x_train, y_train)
-log_predictions = log_model.predict(x_test)
-print(f"Accuracy of Logistic Regression: {models.accuracy(log_predictions, y_test): .2f}%")
-
-# K-Nearest Neighbors Model
-knn_model = models.KNearestNeighbor()
-
-knn_model.fit(x_train, y_train)
-knn_predictions = knn_model.predict(x_test)
-print(f"Accuracy of KNN model: {models.accuracy(knn_predictions, y_test): .2f}%")
-
-
-# plotting the errors for all three
-x_axis_linear = np.arange(len(linear_losses))
-x_axis_logistic = np.arange(len(log_losses))
-plt.plot(x_axis_linear, linear_losses, label="Linear Regression Loss")
-plt.plot(x_axis_logistic, log_losses, label="Logistic Regression Loss")
-plt.xlabel("Epochs")
-plt.ylabel("Loss")
-plt.title("Loss Over Time")
-plt.legend()
-plt.show()
+    linear_losses = linear_model.fit(x_train, y_train)
+    continuous_predictions = linear_model.predict(x_test)
+    linear_predictions = linear_model.predict_class(x_test)
+    print(f"Accuracy of Linear Regression: {models.accuracy(linear_predictions, y_test): .2f}%")
+
+    return linear_losses
+
+
+def evaluate_logistic(x_train, y_train, x_test, y_test):
+    log_model = models.LogisticRegression()
+
+    log_losses = log_model.fit(x_train, y_train)
+    log_predictions = log_model.predict(x_test)
+    print(f"Accuracy of Logistic Regression: {models.accuracy(log_predictions, y_test): .2f}%")
+
+    return log_losses
+
+
+def evaluate_knn(x_train, y_train, x_test, y_test):
+    knn_model = models.KNearestNeighbor()
+
+    knn_model.fit(x_train, y_train)
+    knn_predictions = knn_model.predict(x_test)
+    print(f"Accuracy of KNN model: {models.accuracy(knn_predictions, y_test): .2f}%")
+
+
+def plot_losses(linear_loss, log_loss):
+    x_axis_linear = np.arange(len(linear_loss))
+    x_axis_logistic = np.arange(len(log_loss))
+    plt.plot(x_axis_linear, linear_loss, label="Linear Regression Loss")
+    plt.plot(x_axis_logistic, log_loss, label="Logistic Regression Loss")
+    plt.xlabel("Epochs")
+    plt.ylabel("Loss")
+    plt.title("Loss Over Time")
+    plt.legend()
+    plt.show()
+
+
+def main():
+    # Data preparation
+    file_path = "customer_churn_dataset-testing-master.csv"
+    x_train, y_train, x_test, y_test = prepare_data(file_path)
+
+    # Training the models
+    linear_loss = evaluate_linear(x_train, y_train, x_test, y_test)
+    log_loss = evaluate_logistic(x_train, y_train, x_test, y_test)
+    evaluate_knn(x_train, y_train, x_test, y_test)
+
+    # Plotting losses
+    plot_losses(linear_loss, log_loss)
+
+
+if __name__ == "__main__":
+    main()
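
Note: all three evaluate_* helpers report accuracy through models.accuracy, which lives in the project's models module and is not touched by this commit. As a rough sketch only, assuming the helper simply returns the percentage of predicted labels that match the test labels, it might look like:

import numpy as np

def accuracy(predictions, targets):
    # Hypothetical sketch, not the project's implementation: the percentage
    # of predicted labels that exactly match the true labels.
    predictions = np.asarray(predictions)
    targets = np.asarray(targets)
    return float(np.mean(predictions == targets) * 100.0)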

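evaluate_linear keeps both the raw regression outputs from predict and the discrete labels from predict_class, and only the latter are scored. predict_class is likewise defined outside this diff; as an illustration only, one common way to map continuous regression outputs onto binary churn labels is a fixed cutoff (the 0.5 threshold here is an assumption):

import numpy as np

def predict_class(continuous_predictions, threshold=0.5):
    # Hypothetical sketch: threshold continuous outputs into 0/1 churn labels.
    return (np.asarray(continuous_predictions) >= threshold).astype(int)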