|
12 | 12 |
|
13 | 13 | 2) Data preparation -- DataPrep
|
14 | 14 | - Perform Exploratory Data Analysis (EDA)
|
15 |
| - |
16 | 15 | - Deal with null values based on the results of EDA
|
17 | 16 | - Encode categorical data
|
18 | 17 | - Scale numerical features only -- skip encoded categorical columns
|
19 | 18 | - Split the data set into training and testing sets
|
20 | 19 |
|
21 |
| -5) Model selection and training |
| 20 | +3) Model selection and training |
22 | 21 | - Models for this project:
|
23 | 22 | - Linear Regression
|
24 |
| - - Use regularization to avoid under/overfitting |
25 | 23 | - Multi-Class Logistic Regression
|
26 | 24 | - K-Nearest Neighbors (KNN)
|
27 |
| - - Choose a gradient descent variant and use it for parameter optimization |
28 | 25 |
|
29 |
| -6) Hyperparameter tuning |
30 |
| - - Use grid search / random search for tuning |
31 |
| - - Evaluate models on validation dataset to prevent overfitting |
32 |
| -
|
33 |
| -6) Model Evaluation |
34 |
| - - Evaluating the best model and the hyperparameters which give the best accuracy |
| 26 | +4) Model Evaluation |
| 27 | + - Evaluating the models and reporting their accuracy percentages |
35 | 28 | """
|
36 | 29 |
|
37 |
| -# numerical data .skew() outputs |
38 |
| -# CustomerID: 0.0 |
39 |
| -# Age: -0.040893617755290594 |
40 |
| -# Tenure: -0.12605627128660457 |
41 |
| -# Usage Frequency: 0.03754298828827117 |
42 |
| -# Support Calls: -0.19285414431875514 |
43 |
| -# Payment Delay: -0.35071402695836457 |
44 |
| -# Total Spend: 0.04774634961486376 |
45 |
| -# Last Interaction: 0.005111808910520158 |
46 |
| -# Churn: 0.10540842751365004 |
47 | 30 |
|
48 |
| -data = pd.read_csv("customer_churn_dataset-testing-master.csv") |
49 |
| -data = data.drop("CustomerID", axis=1) # dropping unnecessary columns |
| 31 | +def prepare_data(file_path): |
| 32 | + """ |
| 33 | + load and prepare the dataset |
| 34 | + """ |
| 35 | + data = pd.read_csv(file_path) |
| 36 | + data = data.drop("CustomerID", axis=1) # dropping unnecessary columns |
50 | 37 |
|
51 |
| -""" |
52 |
| -Data Preparation |
53 |
| -""" |
54 |
| -dp = DataPrep(data) |
| 38 | + dp = DataPrep(data) |
| 39 | + dp.clean_data() |
| 40 | + dp.transform() |
55 | 41 |
|
56 |
| -dp.clean_data() |
57 |
| -dp.transform() |
| 42 | + return dp.get_datasets() |
58 | 43 |
|
59 |
| -x_train, y_train, x_test, y_test = dp.get_datasets() |
60 | 44 |
|
| 45 | +def evaluate_linear(x_train, y_train, x_test, y_test): |
| 46 | + linear_model = models.LinearRegression() |
61 | 47 |
|
62 |
| -""" |
63 |
| -Training the models |
64 |
| -""" |
65 |
| -# Linear Regression Model |
66 |
| -linear_model = models.LinearRegression() |
67 |
| - |
68 |
| -linear_losses = linear_model.fit(x_train, y_train) |
69 |
| -continuous_predictions = linear_model.predict(x_test) |
70 |
| -linear_predictions = linear_model.predict_class(x_test) |
71 |
| -print(f"Accuracy of Linear Regression: {models.accuracy(linear_predictions, y_test): .2f}%") |
72 |
| - |
73 |
| -# Logistic Regression Model |
74 |
| -log_model = models.LogisticRegression() |
75 |
| - |
76 |
| -log_losses = log_model.fit(x_train, y_train) |
77 |
| -log_predictions = log_model.predict(x_test) |
78 |
| -print(f"Accuracy of Logistic Regression: {models.accuracy(log_predictions, y_test): .2f}%") |
79 |
| - |
80 |
| -# K-Nearest Neighbors Model |
81 |
| -knn_model = models.KNearestNeighbor() |
82 |
| - |
83 |
| -knn_model.fit(x_train, y_train) |
84 |
| -knn_predictions = knn_model.predict(x_test) |
85 |
| -print(f"Accuracy of KNN model: {models.accuracy(knn_predictions, y_test): .2f}%") |
86 |
| - |
87 |
| - |
88 |
| -# plotting the errors for all three |
89 |
| -x_axis_linear = np.arange(len(linear_losses)) |
90 |
| -x_axis_logistic = np.arange(len(log_losses)) |
91 |
| -plt.plot(x_axis_linear, linear_losses, label="Linear Regression Loss") |
92 |
| -plt.plot(x_axis_logistic, log_losses, label="Logistic Regression Loss") |
93 |
| -plt.xlabel("Epochs") |
94 |
| -plt.ylabel("Loss") |
95 |
| -plt.title("Loss Over Time") |
96 |
| -plt.legend() |
97 |
| -plt.show() |
| 48 | + linear_losses = linear_model.fit(x_train, y_train) |
| 49 | + continuous_predictions = linear_model.predict(x_test) |
| 50 | + linear_predictions = linear_model.predict_class(x_test) |
| 51 | + print(f"Accuracy of Linear Regression: {models.accuracy(linear_predictions, y_test): .2f}%") |
| 52 | + |
| 53 | + return linear_losses |
| 54 | + |
| 55 | + |
| 56 | +def evaluate_logistic(x_train, y_train, x_test, y_test): |
| 57 | + log_model = models.LogisticRegression() |
| 58 | + |
| 59 | + log_losses = log_model.fit(x_train, y_train) |
| 60 | + log_predictions = log_model.predict(x_test) |
| 61 | + print(f"Accuracy of Logistic Regression: {models.accuracy(log_predictions, y_test): .2f}%") |
| 62 | + |
| 63 | + return log_losses |
| 64 | + |
| 65 | + |
| 66 | +def evaluate_knn(x_train, y_train, x_test, y_test): |
| 67 | + knn_model = models.KNearestNeighbor() |
| 68 | + |
| 69 | + knn_model.fit(x_train, y_train) |
| 70 | + knn_predictions = knn_model.predict(x_test) |
| 71 | + print(f"Accuracy of KNN model: {models.accuracy(knn_predictions, y_test): .2f}%") |
| 72 | + |
| 73 | + |
| 74 | +def plot_losses(linear_loss, log_loss): |
| 75 | + x_axis_linear = np.arange(len(linear_loss)) |
| 76 | + x_axis_logistic = np.arange(len(log_loss)) |
| 77 | + plt.plot(x_axis_linear, linear_loss, label="Linear Regression Loss") |
| 78 | + plt.plot(x_axis_logistic, log_loss, label="Logistic Regression Loss") |
| 79 | + plt.xlabel("Epochs") |
| 80 | + plt.ylabel("Loss") |
| 81 | + plt.title("Loss Over Time") |
| 82 | + plt.legend() |
| 83 | + plt.show() |
| 84 | + |
| 85 | + |
| 86 | +def main(): |
| 87 | + # Data preparation |
| 88 | + file_path = "customer_churn_dataset-testing-master.csv" |
| 89 | + x_train, y_train, x_test, y_test = prepare_data(file_path) |
| 90 | + |
| 91 | + # Training the models |
| 92 | + linear_loss = evaluate_linear(x_train, y_train, x_test, y_test) |
| 93 | + log_loss = evaluate_logistic(x_train, y_train, x_test, y_test) |
| 94 | + evaluate_knn(x_train, y_train, x_test, y_test) |
| 95 | + |
| 96 | + # Plotting losses |
| 97 | + plot_losses(linear_loss, log_loss) |
| 98 | + |
| 99 | + |
| 100 | +if __name__ == "__main__": |
| 101 | + main() |
0 commit comments