Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 104 additions & 0 deletions Comparison-gbt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor
from boosting.gradient_boosting import GradientBoostingTree

def load_dataset(file_path):
    """
    Load a dataset from a CSV file.

    Every column except the last one is treated as a feature; the last
    column is treated as the target.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pandas.read_csv``.

    Returns:
        A ``(X, y)`` tuple of NumPy arrays: ``X`` holds all columns but the
        last, ``y`` holds the last column.

    Raises:
        SystemExit: If the file cannot be read or parsed, or if it contains
            no data. The message goes to stderr and the process exit status
            is non-zero, so shell callers can detect the failure.
    """
    try:
        data = pd.read_csv(file_path)
    except (OSError, ValueError) as e:
        # OSError covers missing/unreadable files; pandas' EmptyDataError and
        # ParserError (and UnicodeDecodeError) are all ValueError subclasses.
        raise SystemExit(f"Error loading dataset: {e}") from e

    if data.empty:
        raise SystemExit(
            "Error loading dataset: Dataset is empty. Please provide a valid dataset."
        )

    X = data.iloc[:, :-1].values
    y = data.iloc[:, -1].values
    return X, y

def preprocess_data(X, y):
    """
    Preprocess the dataset: standardise the features and the target.

    Each feature column and the target are shifted to zero mean and divided
    by their (population) standard deviation. A column whose standard
    deviation is zero is divided by 1 instead, so it becomes all zeros
    rather than NaN/inf.

    Args:
        X: 2-D array-like of numeric features, one row per sample.
        y: 1-D array-like of numeric targets.

    Returns:
        A tuple ``(X_scaled, y_scaled, y_mean, y_std)``. ``y_mean`` and
        ``y_std`` are the values actually used for scaling, so
        ``y_scaled * y_std + y_mean`` recovers the original target.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)

    X_mean = np.mean(X, axis=0)
    X_std = np.std(X, axis=0)
    # Constant columns have zero spread; divide them by 1 to avoid 0/0.
    X_std = np.where(X_std == 0, 1.0, X_std)
    X_scaled = (X - X_mean) / X_std

    y_mean = np.mean(y)
    y_std = np.std(y)
    if y_std == 0:
        # Constant target: keep the inverse transform well defined.
        y_std = 1.0
    y_scaled = (y - y_mean) / y_std

    return X_scaled, y_scaled, y_mean, y_std

def evaluate_model(y_true, y_pred):
    """
    Evaluate the model using MSE, MAE, and R² metrics.

    Args:
        y_true: 1-D array-like of observed target values.
        y_pred: 1-D array-like of predicted values, same length as
            ``y_true``.

    Returns:
        A tuple ``(mse, mae, r2)``. When ``y_true`` is constant the usual R²
        denominator is zero; in that case R² is reported as 1.0 for perfect
        predictions and 0.0 otherwise (the convention scikit-learn's
        ``r2_score`` uses by default).
    """
    # Accept plain lists as well as arrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    errors = y_true - y_pred
    mse = np.mean(errors ** 2)
    mae = np.mean(np.abs(errors))

    ss_res = np.sum(errors ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    if ss_tot == 0:
        # Constant target: avoid 0/0 and x/0.
        r2 = 1.0 if ss_res == 0 else 0.0
    else:
        r2 = 1 - (ss_res / ss_tot)
    return mse, mae, r2

def plot_comparison(y_true, custom_pred, sklearn_pred):
    """
    Scatter both models' predictions against the true values.

    The two prediction sets share one figure, together with a dashed red
    diagonal spanning the range of ``y_true`` that marks where a prediction
    equals the true value. The figure is displayed with ``plt.show()``.
    """
    scatter_series = (
        (custom_pred, "Custom Model Predictions", "blue"),
        (sklearn_pred, "Sklearn Model Predictions", "green"),
    )

    plt.figure(figsize=(10, 6))
    for predictions, legend_label, point_color in scatter_series:
        plt.scatter(y_true, predictions, alpha=0.7, label=legend_label, color=point_color)

    # Diagonal reference line: predicted == true.
    lowest, highest = min(y_true), max(y_true)
    plt.plot([lowest, highest], [lowest, highest], 'r--', label="Ideal Fit")

    plt.xlabel("True Values")
    plt.ylabel("Predicted Values")
    plt.title("Comparison of Custom vs Sklearn Model Predictions")
    plt.legend()
    plt.show()

if __name__ == "__main__":
    # Load the CSV and standardise features/target. The target statistics
    # are kept so predictions can be mapped back to original units.
    X, y = load_dataset("data/boston_housing.csv")
    X, y, y_mean, y_std = preprocess_data(X, y)

    # Ordered hold-out split: the first 80% of rows train, the rest test.
    n_train = int(0.8 * len(X))
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]

    # Both implementations are trained with the same hyper-parameters.
    shared_params = dict(n_estimators=100, learning_rate=0.1, max_depth=2)

    print("Training Custom Gradient Boosting Model...")
    custom_model = GradientBoostingTree(**shared_params)
    custom_model.fit(X_train, y_train)
    print("Custom Model training complete!")

    print("Training Sklearn Gradient Boosting Model...")
    sklearn_model = GradientBoostingRegressor(random_state=42, **shared_params)
    sklearn_model.fit(X_train, y_train)
    print("Sklearn Model training complete!")

    def to_original_units(values):
        """Undo the target standardisation applied by preprocess_data."""
        return values * y_std + y_mean

    custom_pred = to_original_units(custom_model.predict(X_test))
    sklearn_pred = to_original_units(sklearn_model.predict(X_test))
    y_test_original = to_original_units(y_test)

    # Report the metrics of each model on the held-out rows.
    print("\nCustom Model Evaluation:")
    custom_mse, custom_mae, custom_r2 = evaluate_model(y_test_original, custom_pred)
    print(f"Mean Squared Error (MSE): {custom_mse:.4f}")
    print(f"Mean Absolute Error (MAE): {custom_mae:.4f}")
    print(f"R² Score: {custom_r2:.4f}")

    print("\nSklearn Model Evaluation:")
    sklearn_mse, sklearn_mae, sklearn_r2 = evaluate_model(y_test_original, sklearn_pred)
    print(f"Mean Squared Error (MSE): {sklearn_mse:.4f}")
    print(f"Mean Absolute Error (MAE): {sklearn_mae:.4f}")
    print(f"R² Score: {sklearn_r2:.4f}")

    # Visual comparison of both prediction sets.
    print("\nPlotting Comparison of Results...")
    plot_comparison(y_test_original, custom_pred, sklearn_pred)
150 changes: 150 additions & 0 deletions Comparison-modelSelection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
import numpy as np
import pandas as pd
from boosting.gradient_boosting import GradientBoostingTree
from model_selection.cross_validation import k_fold_cv
from model_selection.bootstrapping import bootstrap
from sklearn.ensemble import GradientBoostingRegressor

def load_dataset(file_path):
    """
    Load dataset from a CSV file.

    Every column except the last one is treated as a feature; the last
    column is treated as the target.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pandas.read_csv``.

    Returns:
        A ``(X, y)`` tuple of NumPy arrays: ``X`` holds all columns but the
        last, ``y`` holds the last column.

    Raises:
        SystemExit: If the file cannot be read or parsed, or if it contains
            no data. The message goes to stderr and the process exit status
            is non-zero, so shell callers can detect the failure.
    """
    try:
        data = pd.read_csv(file_path)
    except (OSError, ValueError) as e:
        # OSError covers missing/unreadable files; pandas' EmptyDataError and
        # ParserError (and UnicodeDecodeError) are all ValueError subclasses.
        raise SystemExit(f"Error loading dataset: {e}") from e

    if data.empty:
        raise SystemExit(
            "Error loading dataset: Dataset is empty. Please provide a valid dataset."
        )

    X = data.iloc[:, :-1].values
    y = data.iloc[:, -1].values
    return X, y

def preprocess_data(X, y):
    """
    Preprocess the dataset: standardise the features and the target.

    Each feature column and the target are shifted to zero mean and divided
    by their (population) standard deviation. A column whose standard
    deviation is zero is divided by 1 instead, so it becomes all zeros
    rather than NaN/inf.

    Args:
        X: 2-D array-like of numeric features, one row per sample.
        y: 1-D array-like of numeric targets.

    Returns:
        A tuple ``(X_scaled, y_scaled, y_mean, y_std)``. ``y_mean`` and
        ``y_std`` are the values actually used for scaling, so
        ``y_scaled * y_std + y_mean`` recovers the original target.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)

    X_mean = np.mean(X, axis=0)
    X_std = np.std(X, axis=0)
    # Constant columns have zero spread; divide them by 1 to avoid 0/0.
    X_std = np.where(X_std == 0, 1.0, X_std)
    X_scaled = (X - X_mean) / X_std

    y_mean = np.mean(y)
    y_std = np.std(y)
    if y_std == 0:
        # Constant target: keep the inverse transform well defined.
        y_std = 1.0
    y_scaled = (y - y_mean) / y_std

    return X_scaled, y_scaled, y_mean, y_std

def grid_search_max_depth(X, y, max_depth_values, n_estimators=100, learning_rate=0.1, k=5):
    """
    Pick the ``max_depth`` with the lowest k-fold cross-validated MSE.

    A fresh GradientBoostingTree is built for every candidate depth and
    scored with ``k_fold_cv``; each score is printed as it is obtained.
    On ties the earliest candidate is kept.

    Returns:
        ``(best_max_depth, best_score)``; ``(None, inf)`` if no candidate
        scored below infinity.
    """
    winner_depth, winner_score = None, float("inf")

    for max_depth in max_depth_values:
        candidate = GradientBoostingTree(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
        )
        score = k_fold_cv(candidate, X, y, k=k, metric="mse")
        print(f"Max Depth: {max_depth}, CV MSE: {score:.4f}")

        # Only a strictly better score replaces the current winner.
        if not score < winner_score:
            continue
        winner_depth, winner_score = max_depth, score

    return winner_depth, winner_score

def evaluate_model(y_true, y_pred):
    """
    Evaluate the model using MSE, MAE, and R² metrics.

    Args:
        y_true: 1-D array-like of observed target values.
        y_pred: 1-D array-like of predicted values, same length as
            ``y_true``.

    Returns:
        A tuple ``(mse, mae, r2)``. When ``y_true`` is constant the usual R²
        denominator is zero; in that case R² is reported as 1.0 for perfect
        predictions and 0.0 otherwise (the convention scikit-learn's
        ``r2_score`` uses by default).
    """
    # Accept plain lists as well as arrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    errors = y_true - y_pred
    mse = np.mean(errors ** 2)
    mae = np.mean(np.abs(errors))

    ss_res = np.sum(errors ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    if ss_tot == 0:
        # Constant target: avoid 0/0 and x/0.
        r2 = 1.0 if ss_res == 0 else 0.0
    else:
        r2 = 1 - (ss_res / ss_tot)
    return mse, mae, r2

def plot_comparison(y_true, custom_pred, sklearn_pred):
    """
    Scatter both models' predictions against the true values.

    The two prediction sets share one figure, together with a dashed red
    diagonal spanning the range of ``y_true`` that marks where a prediction
    equals the true value. The figure is displayed with ``plt.show()``.
    """
    # matplotlib is imported here, at call time, rather than at module import.
    import matplotlib.pyplot as plt

    scatter_series = (
        (custom_pred, "Custom Model Predictions", "blue"),
        (sklearn_pred, "Sklearn Model Predictions", "green"),
    )

    plt.figure(figsize=(10, 6))
    for predictions, legend_label, point_color in scatter_series:
        plt.scatter(y_true, predictions, alpha=0.7, label=legend_label, color=point_color)

    # Diagonal reference line: predicted == true.
    lowest, highest = min(y_true), max(y_true)
    plt.plot([lowest, highest], [lowest, highest], 'r--', label="Ideal Fit")

    plt.xlabel("True Values")
    plt.ylabel("Predicted Values")
    plt.title("Comparison of Custom vs Sklearn Model Predictions")
    plt.legend()
    plt.show()

if __name__ == "__main__":
    # Load the CSV and standardise features/target. The target statistics
    # are kept so predictions can be mapped back to original units.
    X, y = load_dataset("data/highly_correlated_dataset.csv")
    X, y, y_mean, y_std = preprocess_data(X, y)

    # Hyper-parameters shared by the two baseline models.
    baseline_params = dict(n_estimators=100, learning_rate=0.1, max_depth=3)

    # K-fold cross-validation, custom implementation first.
    print("\nPerforming K-Fold Cross-Validation for Custom Model...")
    custom_model = GradientBoostingTree(**baseline_params)
    custom_cv_score = k_fold_cv(custom_model, X, y, k=5, metric="mse")
    print(f"Custom Model K-Fold Cross-Validation MSE: {custom_cv_score:.4f}")

    print("\nPerforming K-Fold Cross-Validation for Sklearn Model...")
    sklearn_model = GradientBoostingRegressor(random_state=42, **baseline_params)
    sklearn_cv_score = k_fold_cv(sklearn_model, X, y, k=5, metric="mse")
    print(f"Sklearn Model K-Fold Cross-Validation MSE: {sklearn_cv_score:.4f}")

    # Bootstrap estimates; the same model objects used above are passed in.
    print("\nPerforming Bootstrapping for Custom Model...")
    custom_bootstrap_scores, custom_mean_bootstrap_score = bootstrap(
        custom_model, X, y, B=10, metric="mse"
    )
    print(f"Custom Model Bootstrap Mean MSE: {custom_mean_bootstrap_score:.4f}")
    print(f"Custom Model Bootstrap Scores: {custom_bootstrap_scores}")

    print("\nPerforming Bootstrapping for Sklearn Model...")
    sklearn_bootstrap_scores, sklearn_mean_bootstrap_score = bootstrap(
        sklearn_model, X, y, B=10, metric="mse"
    )
    print(f"Sklearn Model Bootstrap Mean MSE: {sklearn_mean_bootstrap_score:.4f}")
    print(f"Sklearn Model Bootstrap Scores: {sklearn_bootstrap_scores}")

    # Depth selection is run for the custom model only; the chosen depth is
    # then used for both final models.
    print("\nPerforming Grid Search for Best Max Depth for Custom Model...")
    max_depth_values = [2, 3, 5]
    best_max_depth, best_cv_score = grid_search_max_depth(X, y, max_depth_values)
    print(f"Best Max Depth: {best_max_depth}")
    print(f"Best Custom Model CV MSE: {best_cv_score:.4f}")

    tuned_params = dict(n_estimators=100, learning_rate=0.1, max_depth=best_max_depth)

    print("\nTraining Final Custom Model with Best Parameters...")
    final_custom_model = GradientBoostingTree(**tuned_params)
    final_custom_model.fit(X, y)
    print("Final Custom Model training complete!")

    print("\nTraining Final Sklearn Model...")
    final_sklearn_model = GradientBoostingRegressor(random_state=42, **tuned_params)
    final_sklearn_model.fit(X, y)
    print("Final Sklearn Model training complete!")

    def to_original_units(values):
        """Undo the target standardisation applied by preprocess_data."""
        return values * y_std + y_mean

    # Note: predictions are made on the same rows the final models were fit on.
    custom_pred = to_original_units(final_custom_model.predict(X))
    sklearn_pred = to_original_units(final_sklearn_model.predict(X))
    y_original = to_original_units(y)

    print("\nEvaluating Both Models...")
    custom_mse, custom_mae, custom_r2 = evaluate_model(y_original, custom_pred)
    print(f"Custom Model - MSE: {custom_mse:.4f}, MAE: {custom_mae:.4f}, R²: {custom_r2:.4f}")

    sklearn_mse, sklearn_mae, sklearn_r2 = evaluate_model(y_original, sklearn_pred)
    print(f"Sklearn Model - MSE: {sklearn_mse:.4f}, MAE: {sklearn_mae:.4f}, R²: {sklearn_r2:.4f}")

    print("\nPlotting Comparison of Both Models...")
    plot_comparison(y_original, custom_pred, sklearn_pred)
96 changes: 96 additions & 0 deletions Model/Gradient_Boosting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import numpy as np
import pandas as pd

class GradientBoostingTree:
    """
    Gradient-boosted regression ensemble built from DecisionTreeRegressor.

    The model starts from the mean of the training targets and then, for
    ``n_estimators`` rounds, fits a tree to the current residuals
    (``y - prediction``) and adds ``learning_rate`` times that tree's output
    to the running prediction.

    Args:
        n_estimators: Number of boosting rounds (trees) to fit.
        learning_rate: Factor applied to every tree's contribution.
        max_depth: Maximum depth passed to each DecisionTreeRegressor.
    """

    def __init__(self, n_estimators, learning_rate, max_depth):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.models = []

    def fit(self, X, y):
        """
        Fit the ensemble on ``X`` (2-D array-like) and ``y`` (1-D array-like).

        Any trees left over from a previous ``fit`` call are discarded
        first, so the same instance can safely be refitted (e.g. once per
        cross-validation fold).
        """
        X = np.asarray(X)
        y = np.asarray(y, dtype=float)

        # Start from a clean slate; otherwise predict() would also sum the
        # trees learned on earlier training sets.
        self.models = []
        self.initial_pred = np.mean(y)
        y_pred = np.full(X.shape[0], self.initial_pred)

        for _ in range(self.n_estimators):
            residuals = y - y_pred
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X, residuals)
            self.models.append(tree)
            y_pred += self.learning_rate * tree.predict(X)

    def predict(self, X):
        """
        Return one prediction per row of ``X``.

        The result is the initial (mean) prediction plus the
        learning-rate-scaled output of every fitted tree.
        """
        X = np.asarray(X)
        y_pred = np.full(X.shape[0], self.initial_pred)
        for tree in self.models:
            y_pred += self.learning_rate * tree.predict(X)
        return y_pred

class DecisionTreeRegressor:
    """
    Regression tree using binary ``feature <= threshold`` splits.

    Each split is chosen to minimise the sample-weighted mean squared error
    of the two children. The fitted tree is stored in ``self.tree`` as
    nested dicts with keys ``"feature_idx"``, ``"threshold"``, ``"left"``
    and ``"right"``; a leaf is stored as a plain number (the mean target of
    the samples that reached it).

    Args:
        max_depth: Depth at which nodes are forced to become leaves.
    """

    def __init__(self, max_depth=3):
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        """Build the tree from ``X`` (2-D array-like) and ``y`` (1-D array-like)."""
        # Coerce once so boolean-mask and column indexing work for lists too.
        X = np.asarray(X)
        y = np.asarray(y)
        self.tree = self._build_tree(X, y, depth=0)

    def predict(self, X):
        """
        Return an array with one prediction per row of ``X``.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.tree is None:
            raise RuntimeError("DecisionTreeRegressor.predict() called before fit().")
        return np.array([self._predict_single(x, self.tree) for x in np.asarray(X)])

    def _build_tree(self, X, y, depth):
        """Recursively build the subtree for the samples ``(X, y)``."""
        # Stop at the depth limit or when every target is identical.
        is_pure = y.size <= 1 or bool(np.all(y == y[0]))
        if depth == self.max_depth or is_pure:
            return np.mean(y)

        best_split = self._find_best_split(X, y)
        if best_split is None:
            # No threshold separates the samples (all rows look alike).
            return np.mean(y)

        feature_idx, threshold = best_split
        left_mask = X[:, feature_idx] <= threshold
        right_mask = ~left_mask

        return {
            "feature_idx": feature_idx,
            "threshold": threshold,
            "left": self._build_tree(X[left_mask], y[left_mask], depth + 1),
            "right": self._build_tree(X[right_mask], y[right_mask], depth + 1),
        }

    def _find_best_split(self, X, y):
        """
        Return the ``(feature_idx, threshold)`` with the lowest weighted MSE.

        Returns ``None`` when no candidate leaves samples on both sides. On
        ties the first candidate found (lowest feature index, then lowest
        threshold) is kept.
        """
        n_samples, n_features = X.shape
        best_mse = float("inf")
        best_split = None

        for feature_idx in range(n_features):
            column = X[:, feature_idx]
            # The largest unique value can never split: nothing would go right.
            for threshold in np.unique(column)[:-1]:
                left_mask = column <= threshold
                n_left = int(left_mask.sum())
                n_right = n_samples - n_left
                if n_left == 0 or n_right == 0:
                    continue

                left_mse = self._compute_mse(y[left_mask])
                right_mse = self._compute_mse(y[~left_mask])
                mse = (n_left * left_mse + n_right * right_mse) / n_samples

                if mse < best_mse:
                    best_mse = mse
                    best_split = (feature_idx, threshold)
        return best_split

    def _compute_mse(self, y):
        """Return the mean squared deviation of ``y`` from its mean (0 if empty)."""
        if len(y) == 0:
            return 0
        return np.mean((y - np.mean(y)) ** 2)

    def _predict_single(self, x, tree):
        """Walk ``tree`` for one sample ``x`` and return the leaf value reached."""
        if not isinstance(tree, dict):
            return tree

        if x[tree["feature_idx"]] <= tree["threshold"]:
            return self._predict_single(x, tree["left"])
        return self._predict_single(x, tree["right"])
Loading