Fall2024CS584 · mishrasharanya · Nov 16, 2024 · Nov 20, 2024 · Nov 21, 2024 · Nov 22, 2024
diff --git a/Comparison-gbt.py b/Comparison-gbt.py
@@ -0,0 +1,104 @@
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from sklearn.ensemble import GradientBoostingRegressor
+from boosting.gradient_boosting import GradientBoostingTree
+
+def load_dataset(file_path):
+    """
+    Load a dataset from a CSV file and split it into features and target.
+
+    Every column except the last is treated as a feature; the last column is
+    the target.
+
+    Args:
+        file_path: Path of the CSV file, passed straight to ``pd.read_csv``.
+
+    Returns:
+        A tuple ``(X, y)`` of NumPy arrays: ``X`` holds all columns but the
+        last, ``y`` holds the last column.
+
+    Raises:
+        SystemExit: With exit status 1 if the file cannot be read or holds no
+            data. The underlying error is printed first.
+    """
+    try:
+        data = pd.read_csv(file_path)
+        if data.empty:
+            raise ValueError("Dataset is empty. Please provide a valid dataset.")
+    except Exception as e:
+        # Script-level boundary: report any load failure and stop. A non-zero
+        # status lets shells and CI see the failure (a bare exit() reports
+        # success), and raising SystemExit does not rely on the interactive
+        # exit() helper that the `site` module installs.
+        print(f"Error loading dataset: {e}")
+        raise SystemExit(1) from e
+
+    X = data.iloc[:, :-1].values
+    y = data.iloc[:, -1].values
+    return X, y
+
+def preprocess_data(X, y):
+    """
+    Standardize features and target to zero mean and unit variance.
+
+    Statistics are computed over all rows passed in, using the population
+    standard deviation (NumPy's default ``ddof=0``). Feature statistics are
+    taken per column (``axis=0``).
+
+    Args:
+        X: Numeric feature array, one row per sample.
+        y: Numeric target array.
+
+    Returns:
+        ``(X_scaled, y_scaled, y_mean, y_std)``. ``y_mean`` and ``y_std`` are
+        the values actually used to scale ``y``, so
+        ``y_scaled * y_std + y_mean`` recovers the original target.
+    """
+    X = np.asarray(X, dtype=float)
+    y = np.asarray(y, dtype=float)
+
+    X_mean = np.mean(X, axis=0)
+    X_std = np.std(X, axis=0)
+    # A constant column has zero spread; dividing by it would fill the column
+    # with NaN. Scale such columns by 1 instead so they become all zeros.
+    X_std = np.where(X_std == 0, 1.0, X_std)
+    X_scaled = (X - X_mean) / X_std
+
+    y_mean = np.mean(y)
+    y_std = np.std(y)
+    if y_std == 0:
+        # Same guard for a constant target. Returning 1 keeps the inverse
+        # transform (y_scaled * y_std + y_mean) exact.
+        y_std = 1.0
+    y_scaled = (y - y_mean) / y_std
+
+    return X_scaled, y_scaled, y_mean, y_std
+
+def evaluate_model(y_true, y_pred):
+    """
+    Evaluate predictions using MSE, MAE, and R² metrics.
+
+    Args:
+        y_true: Observed target values (array-like of numbers).
+        y_pred: Predicted values, same length as ``y_true``.
+
+    Returns:
+        A tuple ``(mse, mae, r2)``. When ``y_true`` is constant the usual R²
+        formula divides by zero; in that case R² is reported as 1.0 for a
+        perfect prediction and 0.0 otherwise, so the result is always finite.
+    """
+    y_true = np.asarray(y_true, dtype=float)
+    y_pred = np.asarray(y_pred, dtype=float)
+
+    errors = y_true - y_pred
+    mse = np.mean(errors ** 2)
+    mae = np.mean(np.abs(errors))
+
+    ss_res = np.sum(errors ** 2)
+    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
+    if ss_tot == 0:
+        # Constant target: avoid 0/0 or x/0 and return a finite score.
+        r2 = 1.0 if ss_res == 0 else 0.0
+    else:
+        r2 = 1 - (ss_res / ss_tot)
+    return mse, mae, r2
+
+def plot_comparison(y_true, custom_pred, sklearn_pred):
+    """
+    Draw both models' predictions against the true values in one scatter plot.
+
+    The custom model is drawn in blue and the sklearn model in green. A red
+    dashed diagonal spanning the range of ``y_true`` marks a perfect fit. The
+    figure is displayed with ``plt.show()``; nothing is returned.
+    """
+    series = (
+        (custom_pred, "Custom Model Predictions", "blue"),
+        (sklearn_pred, "Sklearn Model Predictions", "green"),
+    )
+
+    plt.figure(figsize=(10, 6))
+    for predictions, legend_label, colour in series:
+        plt.scatter(y_true, predictions, alpha=0.7, label=legend_label, color=colour)
+
+    # Reference line y = x between the smallest and largest true value.
+    lowest, highest = min(y_true), max(y_true)
+    plt.plot([lowest, highest], [lowest, highest], 'r--', label="Ideal Fit")
+
+    plt.xlabel("True Values")
+    plt.ylabel("Predicted Values")
+    plt.title("Comparison of Custom vs Sklearn Model Predictions")
+    plt.legend()
+    plt.show()
+
+if __name__ == "__main__":
+    # Load the raw data, then standardize features and target.
+    features, target = load_dataset("data/boston_housing.csv")
+    features, target, y_mean, y_std = preprocess_data(features, target)
+
+    # Hold out the final 20% of rows (file order, no shuffling) for testing.
+    n_train = int(0.8 * len(features))
+    X_train, y_train = features[:n_train], target[:n_train]
+    X_test, y_test = features[n_train:], target[n_train:]
+
+    # Both regressors get the same hyperparameters and the same training rows.
+    shared_params = dict(n_estimators=100, learning_rate=0.1, max_depth=2)
+
+    print("Training Custom Gradient Boosting Model...")
+    custom_model = GradientBoostingTree(**shared_params)
+    custom_model.fit(X_train, y_train)
+    print("Custom Model training complete!")
+
+    print("Training Sklearn Gradient Boosting Model...")
+    sklearn_model = GradientBoostingRegressor(random_state=42, **shared_params)
+    sklearn_model.fit(X_train, y_train)
+    print("Sklearn Model training complete!")
+
+    # Undo the target scaling so metrics are reported in the original units.
+    def to_original_units(scaled):
+        return scaled * y_std + y_mean
+
+    custom_pred = to_original_units(custom_model.predict(X_test))
+    sklearn_pred = to_original_units(sklearn_model.predict(X_test))
+    y_test_original = to_original_units(y_test)
+
+    # Print the same three metrics for each model, custom model first.
+    reports = (
+        ("\nCustom Model Evaluation:", custom_pred),
+        ("\nSklearn Model Evaluation:", sklearn_pred),
+    )
+    for heading, predictions in reports:
+        print(heading)
+        mse, mae, r2 = evaluate_model(y_test_original, predictions)
+        print(f"Mean Squared Error (MSE): {mse:.4f}")
+        print(f"Mean Absolute Error (MAE): {mae:.4f}")
+        print(f"R² Score: {r2:.4f}")
+
+    print("\nPlotting Comparison of Results...")
+    plot_comparison(y_test_original, custom_pred, sklearn_pred)
diff --git a/Comparison-modelSelection.py b/Comparison-modelSelection.py
@@ -0,0 +1,150 @@
+import numpy as np
+import pandas as pd
+from boosting.gradient_boosting import GradientBoostingTree
+from model_selection.cross_validation import k_fold_cv
+from model_selection.bootstrapping import bootstrap
+from sklearn.ensemble import GradientBoostingRegressor
+
+def load_dataset(file_path):
+    """
+    Load a dataset from a CSV file and split it into features and target.
+
+    Every column except the last is treated as a feature; the last column is
+    the target.
+
+    Args:
+        file_path: Path of the CSV file, passed straight to ``pd.read_csv``.
+
+    Returns:
+        A tuple ``(X, y)`` of NumPy arrays: ``X`` holds all columns but the
+        last, ``y`` holds the last column.
+
+    Raises:
+        SystemExit: With exit status 1 if the file cannot be read or holds no
+            data. The underlying error is printed first.
+    """
+    try:
+        data = pd.read_csv(file_path)
+        if data.empty:
+            raise ValueError("Dataset is empty. Please provide a valid dataset.")
+    except Exception as e:
+        # Script-level boundary: report any load failure and stop. A non-zero
+        # status lets shells and CI see the failure (a bare exit() reports
+        # success), and raising SystemExit does not rely on the interactive
+        # exit() helper that the `site` module installs.
+        print(f"Error loading dataset: {e}")
+        raise SystemExit(1) from e
+
+    X = data.iloc[:, :-1].values
+    y = data.iloc[:, -1].values
+    return X, y
+
+def preprocess_data(X, y):
+    """
+    Standardize features and target to zero mean and unit variance.
+
+    Statistics are computed over all rows passed in, using the population
+    standard deviation (NumPy's default ``ddof=0``). Feature statistics are
+    taken per column (``axis=0``).
+
+    Args:
+        X: Numeric feature array, one row per sample.
+        y: Numeric target array.
+
+    Returns:
+        ``(X_scaled, y_scaled, y_mean, y_std)``. ``y_mean`` and ``y_std`` are
+        the values actually used to scale ``y``, so
+        ``y_scaled * y_std + y_mean`` recovers the original target.
+    """
+    X = np.asarray(X, dtype=float)
+    y = np.asarray(y, dtype=float)
+
+    X_mean = np.mean(X, axis=0)
+    X_std = np.std(X, axis=0)
+    # A constant column has zero spread; dividing by it would fill the column
+    # with NaN. Scale such columns by 1 instead so they become all zeros.
+    X_std = np.where(X_std == 0, 1.0, X_std)
+    X_scaled = (X - X_mean) / X_std
+
+    y_mean = np.mean(y)
+    y_std = np.std(y)
+    if y_std == 0:
+        # Same guard for a constant target. Returning 1 keeps the inverse
+        # transform (y_scaled * y_std + y_mean) exact.
+        y_std = 1.0
+    y_scaled = (y - y_mean) / y_std
+
+    return X_scaled, y_scaled, y_mean, y_std
+
+def grid_search_max_depth(X, y, max_depth_values, n_estimators=100, learning_rate=0.1, k=5):
+    """
+    Pick the ``max_depth`` with the lowest k-fold cross-validated MSE.
+
+    For each candidate depth, in the order given, a fresh
+    ``GradientBoostingTree`` is scored with ``k_fold_cv`` and the result is
+    printed. A later candidate replaces the current best only when its score
+    is strictly lower.
+
+    Returns:
+        ``(best_max_depth, best_score)``; ``(None, inf)`` if no candidate
+        scored below infinity.
+    """
+    best = (None, float("inf"))
+
+    for depth in max_depth_values:
+        candidate = GradientBoostingTree(
+            n_estimators=n_estimators, learning_rate=learning_rate, max_depth=depth
+        )
+        cv_mse = k_fold_cv(candidate, X, y, k=k, metric="mse")
+        print(f"Max Depth: {depth}, CV MSE: {cv_mse:.4f}")
+        if cv_mse < best[1]:
+            best = (depth, cv_mse)
+
+    return best
+
+def evaluate_model(y_true, y_pred):
+    """
+    Evaluate predictions using MSE, MAE, and R² metrics.
+
+    Args:
+        y_true: Observed target values (array-like of numbers).
+        y_pred: Predicted values, same length as ``y_true``.
+
+    Returns:
+        A tuple ``(mse, mae, r2)``. When ``y_true`` is constant the usual R²
+        formula divides by zero; in that case R² is reported as 1.0 for a
+        perfect prediction and 0.0 otherwise, so the result is always finite.
+    """
+    y_true = np.asarray(y_true, dtype=float)
+    y_pred = np.asarray(y_pred, dtype=float)
+
+    errors = y_true - y_pred
+    mse = np.mean(errors ** 2)
+    mae = np.mean(np.abs(errors))
+
+    ss_res = np.sum(errors ** 2)
+    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
+    if ss_tot == 0:
+        # Constant target: avoid 0/0 or x/0 and return a finite score.
+        r2 = 1.0 if ss_res == 0 else 0.0
+    else:
+        r2 = 1 - (ss_res / ss_tot)
+    return mse, mae, r2
+
+def plot_comparison(y_true, custom_pred, sklearn_pred):
+    """
+    Draw both models' predictions against the true values in one scatter plot.
+
+    The custom model is drawn in blue and the sklearn model in green. A red
+    dashed diagonal spanning the range of ``y_true`` marks a perfect fit. The
+    figure is displayed with ``plt.show()``; nothing is returned.
+    """
+    # Imported here so matplotlib is only loaded when a plot is requested.
+    import matplotlib.pyplot as plt
+
+    series = (
+        (custom_pred, "Custom Model Predictions", "blue"),
+        (sklearn_pred, "Sklearn Model Predictions", "green"),
+    )
+
+    plt.figure(figsize=(10, 6))
+    for predictions, legend_label, colour in series:
+        plt.scatter(y_true, predictions, alpha=0.7, label=legend_label, color=colour)
+
+    # Reference line y = x between the smallest and largest true value.
+    lowest, highest = min(y_true), max(y_true)
+    plt.plot([lowest, highest], [lowest, highest], 'r--', label="Ideal Fit")
+
+    plt.xlabel("True Values")
+    plt.ylabel("Predicted Values")
+    plt.title("Comparison of Custom vs Sklearn Model Predictions")
+    plt.legend()
+    plt.show()
+
+if __name__ == "__main__":
+    # Load the data, then standardize features and target.
+    X, y = load_dataset("data/highly_correlated_dataset.csv")
+    X, y, y_mean, y_std = preprocess_data(X, y)
+
+    # Hyperparameters shared by the cross-validated and bootstrapped models.
+    base_params = dict(n_estimators=100, learning_rate=0.1, max_depth=3)
+
+    # K-fold cross-validation (k=5, MSE): custom model first, then sklearn.
+    print("\nPerforming K-Fold Cross-Validation for Custom Model...")
+    custom_model = GradientBoostingTree(**base_params)
+    cv_mse_custom = k_fold_cv(custom_model, X, y, k=5, metric="mse")
+    print(f"Custom Model K-Fold Cross-Validation MSE: {cv_mse_custom:.4f}")
+
+    print("\nPerforming K-Fold Cross-Validation for Sklearn Model...")
+    sklearn_model = GradientBoostingRegressor(random_state=42, **base_params)
+    cv_mse_sklearn = k_fold_cv(sklearn_model, X, y, k=5, metric="mse")
+    print(f"Sklearn Model K-Fold Cross-Validation MSE: {cv_mse_sklearn:.4f}")
+
+    # Bootstrap (B=10, MSE), reusing the same two model objects.
+    print("\nPerforming Bootstrapping for Custom Model...")
+    boot_scores_custom, boot_mean_custom = bootstrap(custom_model, X, y, B=10, metric="mse")
+    print(f"Custom Model Bootstrap Mean MSE: {boot_mean_custom:.4f}")
+    print(f"Custom Model Bootstrap Scores: {boot_scores_custom}")
+
+    print("\nPerforming Bootstrapping for Sklearn Model...")
+    boot_scores_sklearn, boot_mean_sklearn = bootstrap(sklearn_model, X, y, B=10, metric="mse")
+    print(f"Sklearn Model Bootstrap Mean MSE: {boot_mean_sklearn:.4f}")
+    print(f"Sklearn Model Bootstrap Scores: {boot_scores_sklearn}")
+
+    # Choose max_depth for the custom model by cross-validated MSE.
+    print("\nPerforming Grid Search for Best Max Depth for Custom Model...")
+    best_max_depth, best_cv_score = grid_search_max_depth(X, y, [2, 3, 5])
+    print(f"Best Max Depth: {best_max_depth}")
+    print(f"Best Custom Model CV MSE: {best_cv_score:.4f}")
+
+    # Refit both models on all rows using the selected depth.
+    final_params = dict(n_estimators=100, learning_rate=0.1, max_depth=best_max_depth)
+
+    print("\nTraining Final Custom Model with Best Parameters...")
+    final_custom_model = GradientBoostingTree(**final_params)
+    final_custom_model.fit(X, y)
+    print("Final Custom Model training complete!")
+
+    print("\nTraining Final Sklearn Model...")
+    final_sklearn_model = GradientBoostingRegressor(random_state=42, **final_params)
+    final_sklearn_model.fit(X, y)
+    print("Final Sklearn Model training complete!")
+
+    # Undo the target scaling so metrics are reported in the original units.
+    # Note that these predictions are made on the same rows used for fitting.
+    def to_original_units(scaled):
+        return scaled * y_std + y_mean
+
+    custom_pred = to_original_units(final_custom_model.predict(X))
+    sklearn_pred = to_original_units(final_sklearn_model.predict(X))
+    y_original = to_original_units(y)
+
+    print("\nEvaluating Both Models...")
+    c_mse, c_mae, c_r2 = evaluate_model(y_original, custom_pred)
+    print(f"Custom Model - MSE: {c_mse:.4f}, MAE: {c_mae:.4f}, R²: {c_r2:.4f}")
+
+    s_mse, s_mae, s_r2 = evaluate_model(y_original, sklearn_pred)
+    print(f"Sklearn Model - MSE: {s_mse:.4f}, MAE: {s_mae:.4f}, R²: {s_r2:.4f}")
+
+    print("\nPlotting Comparison of Both Models...")
+    plot_comparison(y_original, custom_pred, sklearn_pred)
diff --git a/Model/Gradient_Boosting.py b/Model/Gradient_Boosting.py
@@ -0,0 +1,96 @@
+import numpy as np
+import pandas as pd
+
+class GradientBoostingTree:
+    """
+    Gradient-boosted regression trees fitted to residuals.
+
+    The model starts from the mean of the training targets and, for each
+    boosting round, fits a ``DecisionTreeRegressor`` to the current residuals
+    ``y - y_pred`` and adds that tree's prediction scaled by
+    ``learning_rate``.
+    """
+
+    def __init__(self, n_estimators, learning_rate, max_depth):
+        """
+        Store the hyperparameters; no training happens here.
+
+        Args:
+            n_estimators: Number of boosting rounds (trees) to fit.
+            learning_rate: Factor applied to every tree's contribution.
+            max_depth: Depth limit passed to each ``DecisionTreeRegressor``.
+        """
+        self.n_estimators = n_estimators
+        self.learning_rate = learning_rate
+        self.max_depth = max_depth
+        self.models = []
+
+    def fit(self, X, y):
+        """
+        Fit the ensemble to ``X`` and ``y``, discarding any previous fit.
+
+        Args:
+            X: Feature array, one row per sample.
+            y: Numeric targets, one per row of ``X``.
+        """
+        X = np.asarray(X)
+        y = np.asarray(y, dtype=float)
+
+        # Start from scratch on every call. Without this reset, refitting the
+        # same instance (e.g. once per cross-validation fold) would keep the
+        # trees of earlier fits and predict() would sum all of them.
+        self.models = []
+        self.initial_pred = np.mean(y)
+        y_pred = np.full(X.shape[0], self.initial_pred)
+
+        for _ in range(self.n_estimators):
+            residuals = y - y_pred
+            tree = DecisionTreeRegressor(max_depth=self.max_depth)
+            tree.fit(X, residuals)
+            self.models.append(tree)
+            y_pred += self.learning_rate * tree.predict(X)
+
+    def predict(self, X):
+        """
+        Predict targets for the rows of ``X``.
+
+        Requires a prior call to ``fit``, which sets ``initial_pred``.
+
+        Returns:
+            A 1-D array: the training mean plus the learning-rate-scaled sum
+            of all fitted trees' predictions.
+        """
+        X = np.asarray(X)
+        y_pred = np.full(X.shape[0], self.initial_pred)
+        for tree in self.models:
+            y_pred += self.learning_rate * tree.predict(X)
+        return y_pred
+
+class DecisionTreeRegressor:
+    """
+    Regression tree grown by greedy, exhaustive split search.
+
+    A fitted tree is stored in ``self.tree`` as nested dicts with the keys
+    ``feature_idx``, ``threshold``, ``left`` and ``right``. A leaf is stored
+    as the plain mean of the targets that reached it.
+    """
+
+    def __init__(self, max_depth=3):
+        """Create an unfitted tree that stops splitting at depth ``max_depth``."""
+        self.tree = None
+        self.max_depth = max_depth
+
+    def fit(self, X, y):
+        """Grow the tree from the rows of ``X`` and targets ``y``, starting at depth 0."""
+        self.tree = self._build_tree(X, y, depth=0)
+
+    def predict(self, X):
+        """Return an array with the leaf value reached by each row of ``X``."""
+        root = self.tree
+        return np.array([self._predict_single(row, root) for row in X])
+
+    def _build_tree(self, X, y, depth):
+        """
+        Recursively build the subtree for the samples ``(X, y)``.
+
+        Returns the mean of ``y`` (a leaf) when the depth limit is reached,
+        when ``y`` holds a single distinct value, or when no valid split
+        exists. Otherwise returns a dict node with its two child subtrees.
+        """
+        at_depth_limit = depth == self.max_depth
+        if at_depth_limit or len(set(y)) == 1:
+            return np.mean(y)
+
+        split = self._find_best_split(X, y)
+        if split is None:
+            return np.mean(y)
+
+        column, cut = split
+        goes_left = X[:, column] <= cut
+        goes_right = ~goes_left
+
+        # Left subtree is built before the right one.
+        left_child = self._build_tree(X[goes_left], y[goes_left], depth + 1)
+        right_child = self._build_tree(X[goes_right], y[goes_right], depth + 1)
+        return {
+            "feature_idx": column,
+            "threshold": cut,
+            "left": left_child,
+            "right": right_child,
+        }
+
+    def _find_best_split(self, X, y):
+        """
+        Search every feature and every distinct feature value for a split.
+
+        A candidate sends rows with ``value <= threshold`` left and the rest
+        right; candidates leaving either side empty are skipped. The score is
+        the size-weighted average of the two sides' MSE, and the first
+        candidate with the strictly lowest score wins.
+
+        Returns:
+            ``(feature_idx, threshold)``, or ``None`` if no candidate is valid.
+        """
+        n_rows, n_cols = X.shape
+        winner = None
+        lowest = float("inf")
+
+        for column in range(n_cols):
+            values = X[:, column]
+            for cut in np.unique(values):
+                in_left = values <= cut
+                n_left = in_left.sum()
+                n_right = n_rows - n_left
+                if n_left == 0 or n_right == 0:
+                    continue
+
+                left_mse = self._compute_mse(y[in_left])
+                right_mse = self._compute_mse(y[~in_left])
+                weighted = (n_left * left_mse + n_right * right_mse) / n_rows
+
+                if weighted < lowest:
+                    lowest, winner = weighted, (column, cut)
+        return winner
+
+    def _compute_mse(self, y):
+        """Return the mean squared deviation of ``y`` from its mean (0 if empty)."""
+        if len(y) == 0:
+            return 0
+        deviations = y - np.mean(y)
+        return np.mean(deviations ** 2)
+
+    def _predict_single(self, x, tree):
+        """Walk from node ``tree`` down to a leaf for sample ``x`` and return the leaf value."""
+        node = tree
+        # Anything that is not a dict is a leaf value.
+        while isinstance(node, dict):
+            if x[node["feature_idx"]] <= node["threshold"]:
+                node = node["left"]
+            else:
+                node = node["right"]
+        return node