diff --git a/src/linearboost/linear_boost.py b/src/linearboost/linear_boost.py index e6fac67..64073ec 100644 --- a/src/linearboost/linear_boost.py +++ b/src/linearboost/linear_boost.py @@ -18,7 +18,7 @@ from numbers import Integral, Real if sys.version_info >= (3, 11): - from typing import Self + from typing import Self # pragma: no cover else: from typing_extensions import Self @@ -104,8 +104,7 @@ class LinearBoostClassifier(AdaBoostClassifier): - 'maxabs': Uses MaxAbsScaler. - 'robust': Applies RobustScaler. - class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, \ - default=None + class_weight : {"balanced"}, dict or list of dicts, default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. @@ -113,10 +112,6 @@ class LinearBoostClassifier(AdaBoostClassifier): weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))`` - The "balanced_subsample" mode is the same as "balanced" except that - weights are computed based on the bootstrap sample for every tree - grown. - Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. @@ -196,7 +191,7 @@ class LinearBoostClassifier(AdaBoostClassifier): "algorithm": [StrOptions({"SAMME", "SAMME.R"})], "scaler": [StrOptions({s for s in _scalers})], "class_weight": [ - StrOptions({"balanced_subsample", "balanced"}), + StrOptions({"balanced"}), dict, list, None, @@ -307,15 +302,9 @@ def fit(self, X, y, sample_weight=None) -> Self: self.n_classes_ = self.classes_.shape[0] if self.class_weight is not None: - valid_presets = ("balanced", "balanced_subsample") - if ( - isinstance(self.class_weight, str) - and self.class_weight not in valid_presets - ): + if isinstance(self.class_weight, str) and self.class_weight != "balanced": raise ValueError( - "Valid presets for class_weight include " - '"balanced" and "balanced_subsample".' - 'Given "%s".' % self.class_weight + f'Valid preset for class_weight is "balanced". Given "{self.class_weight}".' ) expanded_class_weight = compute_sample_weight(self.class_weight, y) diff --git a/src/linearboost/sefr.py b/src/linearboost/sefr.py index db6477f..60def0b 100644 --- a/src/linearboost/sefr.py +++ b/src/linearboost/sefr.py @@ -3,7 +3,7 @@ import sys if sys.version_info >= (3, 11): - from typing import Self + from typing import Self # pragma: no cover else: from typing_extensions import Self diff --git a/tests/test_linearboost.py b/tests/test_linearboost.py index c6e8e96..f01a23f 100644 --- a/tests/test_linearboost.py +++ b/tests/test_linearboost.py @@ -1,4 +1,12 @@ +from unittest.mock import patch + +import numpy as np +import pytest from sklearn.base import is_classifier +from sklearn.datasets import load_breast_cancer, make_classification +from sklearn.exceptions import NotFittedError +from sklearn.model_selection import train_test_split +from sklearn.utils._testing import assert_allclose, assert_array_equal from linearboost.linear_boost import LinearBoostClassifier @@ -6,11 +14,863 @@ def test_linear_boost_estimator(): - """ - Test whether `LinearBoostClassifier` adheres to scikit-learn conventions. 
- """ + """Test whether LinearBoostClassifier adheres to scikit-learn conventions.""" check_estimator( LinearBoostClassifier(), expected_failed_checks=get_expected_failed_tests(LinearBoostClassifier()), ) assert is_classifier(LinearBoostClassifier) + + +def test_init_default_parameters(): + """Test LinearBoostClassifier initialization with default parameters.""" + clf = LinearBoostClassifier() + assert clf.n_estimators == 200 + assert clf.learning_rate == 1.0 + assert clf.algorithm == "SAMME.R" + assert clf.scaler == "minmax" + assert clf.class_weight is None + assert clf.loss_function is None + + +@pytest.mark.parametrize("n_estimators", [10, 50, 100, 200]) +def test_init_n_estimators(n_estimators): + """Test initialization with different n_estimators.""" + clf = LinearBoostClassifier(n_estimators=n_estimators) + assert clf.n_estimators == n_estimators + + +@pytest.mark.parametrize("learning_rate", [0.1, 0.5, 1.0, 2.0]) +def test_init_learning_rate(learning_rate): + """Test initialization with different learning rates.""" + clf = LinearBoostClassifier(learning_rate=learning_rate) + assert clf.learning_rate == learning_rate + + +@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"]) +def test_init_algorithm(algorithm): + """Test initialization with different algorithms.""" + clf = LinearBoostClassifier(algorithm=algorithm) + assert clf.algorithm == algorithm + + +@pytest.mark.parametrize( + "scaler", + [ + "minmax", + "quantile-uniform", + "quantile-normal", + "normalizer-l1", + "normalizer-l2", + "normalizer-max", + "standard", + "power", + "maxabs", + "robust", + ], +) +def test_init_scaler(scaler): + """Test initialization with different scalers.""" + clf = LinearBoostClassifier(scaler=scaler) + assert clf.scaler == scaler + + +@pytest.mark.parametrize("class_weight", [None, "balanced", {0: 1.0, 1: 2.0}]) +def test_init_class_weight(class_weight): + """Test initialization with different class weights.""" + clf = LinearBoostClassifier(class_weight=class_weight) + assert clf.class_weight == class_weight + + +def test_init_custom_loss_function(): + """Test initialization with custom loss function.""" + + def custom_loss(y_true, y_pred, sample_weight=None): + return np.mean((y_true - y_pred) ** 2) + + clf = LinearBoostClassifier(loss_function=custom_loss) + assert clf.loss_function == custom_loss + + +@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"]) +@pytest.mark.parametrize("n_estimators", [10, 50]) +def test_fit_simple_data(algorithm, n_estimators): + """Test fitting on simple 2D data.""" + X = np.array([[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]) + y = np.array([0, 0, 0, 1, 1, 1]) + + clf = LinearBoostClassifier(n_estimators=n_estimators, algorithm=algorithm) + clf.fit(X, y) + + # Check fitted attributes + assert hasattr(clf, "estimators_") + assert hasattr(clf, "estimator_weights_") + assert hasattr(clf, "estimator_errors_") + assert hasattr(clf, "classes_") + assert hasattr(clf, "n_classes_") + assert hasattr(clf, "n_features_in_") + assert hasattr(clf, "scaler_") + + # Check shapes and properties + assert len(clf.estimators_) <= n_estimators + assert len(clf.estimator_weights_) == len(clf.estimator_errors_) + assert np.count_nonzero(clf.estimator_weights_) == len(clf.estimators_) + assert clf.classes_.shape == (2,) + assert clf.n_classes_ == 2 + assert clf.n_features_in_ == X.shape[1] + + +def test_fit_with_sample_weight(): + """Test fitting with sample weights.""" + X, y = make_classification( + n_samples=100, + n_features=4, + n_redundant=0, + n_informative=4, + 
random_state=42, + n_clusters_per_class=1, + ) + sample_weight = np.random.RandomState(42).rand(len(X)) + + clf = LinearBoostClassifier(n_estimators=10) + clf.fit(X, y, sample_weight=sample_weight) + + assert hasattr(clf, "estimators_") + assert len(clf.estimators_) <= 10 + + +@pytest.mark.parametrize("scaler", ["minmax", "standard", "robust"]) +def test_fit_different_scalers(scaler): + """Test fitting with different scalers.""" + X, y = make_classification( + n_samples=50, + n_features=4, + n_redundant=0, + random_state=42, + n_clusters_per_class=1, + ) + + clf = LinearBoostClassifier(n_estimators=5, scaler=scaler) + clf.fit(X, y) + + assert hasattr(clf, "scaler_") + predictions = clf.predict(X) + assert len(predictions) == len(X) + + +@pytest.mark.parametrize("class_weight", ["balanced", {0: 1.0, 1: 2.0}]) +def test_fit_with_class_weight(class_weight): + """Test fitting with class weights.""" + # Create imbalanced dataset + X = np.random.RandomState(42).randn(100, 4) + y = np.concatenate([np.zeros(80), np.ones(20)]) # Imbalanced: 80/20 split + + clf = LinearBoostClassifier(n_estimators=10, class_weight=class_weight) + clf.fit(X, y) + + assert hasattr(clf, "estimators_") + predictions = clf.predict(X) + assert len(predictions) == len(X) + + +def test_fit_with_dict_class_weight(): + """Test fitting with dictionary class weights.""" + X, y = make_classification( + n_samples=50, + n_features=4, + n_redundant=0, + random_state=42, + n_clusters_per_class=1, + ) + class_weight = {0: 1.0, 1: 2.0} + + clf = LinearBoostClassifier(n_estimators=5, class_weight=class_weight) + clf.fit(X, y) + + assert hasattr(clf, "estimators_") + predictions = clf.predict(X) + assert len(predictions) == len(X) + + +@pytest.mark.parametrize("n_samples,n_features", [(50, 3), (100, 5)]) +def test_fit_various_sizes(n_samples, n_features): + """Test fitting on datasets of various sizes.""" + X, y = make_classification( + n_samples=n_samples, + n_features=n_features, + n_redundant=0, + random_state=42, + n_clusters_per_class=1, + ) + + clf = LinearBoostClassifier(n_estimators=5) + clf.fit(X, y) + + assert clf.n_features_in_ == n_features + assert len(clf.estimators_) <= 5 + + +def test_predict_binary_classification(): + """Test prediction for binary classification.""" + X, y = make_classification( + n_samples=100, + n_features=4, + n_redundant=0, + random_state=42, + n_clusters_per_class=1, + ) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.3, random_state=42 + ) + + clf = LinearBoostClassifier(n_estimators=10) + clf.fit(X_train, y_train) + predictions = clf.predict(X_test) + + assert predictions.shape == (X_test.shape[0],) + assert set(predictions).issubset(set(y_train)) + + +def test_predict_proba(): + """Test probability prediction.""" + X, y = make_classification( + n_samples=100, + n_features=4, + n_redundant=0, + random_state=42, + n_clusters_per_class=1, + ) + + clf = LinearBoostClassifier(n_estimators=10) + clf.fit(X, y) + probas = clf.predict_proba(X) + + # Check shape + assert probas.shape == (X.shape[0], 2) + + # Check probabilities are valid + assert np.all(probas >= 0) + assert np.all(probas <= 1) + assert_allclose(probas.sum(axis=1), 1.0, rtol=1e-7) + + +def test_predict_log_proba(): + """Test log probability prediction.""" + X, y = make_classification( + n_samples=100, + n_features=4, + n_redundant=0, + random_state=42, + n_clusters_per_class=1, + ) + + clf = LinearBoostClassifier(n_estimators=10) + clf.fit(X, y) + log_probas = clf.predict_log_proba(X) + + # Check shape + assert 
log_probas.shape == (X.shape[0], 2) + + # Check all values are <= 0 (log probabilities) + assert np.all(log_probas <= 0) + + # Compare with predict_proba + probas = clf.predict_proba(X) + expected_log_probas = np.log(probas) + assert_allclose(log_probas, expected_log_probas, rtol=1e-5) + + +@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"]) +def test_decision_function(algorithm): + """Test decision function.""" + X, y = make_classification( + n_samples=100, + n_features=4, + n_redundant=0, + random_state=42, + n_clusters_per_class=1, + ) + + clf = LinearBoostClassifier(n_estimators=10, algorithm=algorithm) + clf.fit(X, y) + decision = clf.decision_function(X) + + assert decision.shape == (X.shape[0],) + + # Check consistency with predict + predictions = clf.predict(X) + expected_predictions = (decision > 0).astype(int) + + # Map to class labels + predicted_classes = clf.classes_[expected_predictions] + assert_array_equal(predictions, predicted_classes) + + +def test_score_method(): + """Test the score method.""" + X, y = load_breast_cancer(return_X_y=True) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.3, random_state=42 + ) + + clf = LinearBoostClassifier(n_estimators=20) + clf.fit(X_train, y_train) + score = clf.score(X_test, y_test) + + assert isinstance(score, float) + assert 0 <= score <= 1 + + +# Error handling tests +def test_predict_before_fit_raises_error(): + """Test that predicting before fitting raises NotFittedError.""" + X = np.array([[1, 2], [3, 4]]) + clf = LinearBoostClassifier() + + with pytest.raises(NotFittedError): + clf.predict(X) + + +def test_predict_proba_before_fit_raises_error(): + """Test that predict_proba before fitting raises NotFittedError.""" + X = np.array([[1, 2], [3, 4]]) + clf = LinearBoostClassifier() + + with pytest.raises(NotFittedError): + clf.predict_proba(X) + + +def test_decision_function_before_fit_raises_error(): + """Test that decision_function before fitting raises NotFittedError.""" + X = np.array([[1, 2], [3, 4]]) + clf = LinearBoostClassifier() + + with pytest.raises(NotFittedError): + clf.decision_function(X) + + +def test_single_class_error(): + """Test that fitting with single class raises ValueError.""" + X = np.array([[1, 2], [3, 4], [5, 6]]) + y = np.array([0, 0, 0]) # All same class + + clf = LinearBoostClassifier() + with pytest.raises( + ValueError, match="Classifier can't train when only one class is present" + ): + clf.fit(X, y) + + +def test_multiclass_error(): + """Test that multiclass data raises ValueError.""" + X = np.array([[1, 2], [3, 4], [5, 6]]) + y = np.array([0, 1, 2]) # Three classes + + clf = LinearBoostClassifier() + with pytest.raises( + ValueError, + match=r"Only binary classification is supported|Unknown label type: non-binary", + ): + clf.fit(X, y) + + +def test_multiclass_error_sklearn_pre_16(): + """Test multiclass error with sklearn < 1.6 message.""" + X = np.array([[1, 2], [3, 4], [5, 6]]) + y = np.array([0, 1, 2]) # Three classes + + clf = LinearBoostClassifier() + + # Mock SKLEARN_V1_6_OR_LATER to be False to test the older sklearn message + with patch("linearboost.linear_boost.SKLEARN_V1_6_OR_LATER", False): + with pytest.raises(ValueError, match="Unknown label type: non-binary"): + clf.fit(X, y) + + +def test_invalid_algorithm_error(): + """Test that invalid algorithm raises ValueError.""" + X = np.array([[1, 2], [3, 4]]) + y = np.array([0, 1]) + + clf = LinearBoostClassifier(algorithm="INVALID") + with pytest.raises(ValueError, match="algorithm must be 'SAMME' or 
'SAMME.R'"):
+        clf.fit(X, y)
+
+
+def test_invalid_scaler_error():
+    """Test that invalid scaler raises ValueError."""
+    X = np.array([[1, 2], [3, 4]])
+    y = np.array([0, 1])
+
+    clf = LinearBoostClassifier(scaler="invalid_scaler")
+    with pytest.raises(ValueError, match="Invalid scaler provided"):
+        clf.fit(X, y)
+
+
+def test_invalid_class_weight_error():
+    """Test that invalid class weight string raises ValueError."""
+    X = np.array([[1, 2], [3, 4]])
+    y = np.array([0, 1])
+
+    clf = LinearBoostClassifier(class_weight="invalid_weight")
+    with pytest.raises(ValueError, match='Valid preset for class_weight is "balanced"'):
+        clf.fit(X, y)
+
+
+def test_invalid_sample_weight_shape():
+    """Test that invalid sample weight shape raises ValueError."""
+    X = np.array([[1, 2], [3, 4]])
+    y = np.array([0, 1])
+    sample_weight = np.array([0.5])  # Wrong length
+
+    clf = LinearBoostClassifier()
+    with pytest.raises(ValueError, match="sample_weight.shape"):
+        clf.fit(X, y, sample_weight=sample_weight)
+
+
+def test_inconsistent_feature_numbers():
+    """Test that inconsistent feature numbers raise ValueError."""
+    X_train = np.array([[1, 2], [3, 4]])
+    y_train = np.array([0, 1])
+    X_test = np.array([[1, 2, 3]])  # Different number of features
+
+    clf = LinearBoostClassifier(n_estimators=5)
+    clf.fit(X_train, y_train)
+
+    with pytest.raises(ValueError):
+        clf.predict(X_test)
+
+
+def test_empty_data_error():
+    """Test that empty data raises appropriate error."""
+    X = np.array([]).reshape(0, 2)
+    y = np.array([])
+
+    clf = LinearBoostClassifier()
+    with pytest.raises(ValueError):
+        clf.fit(X, y)
+
+
+def test_zero_n_estimators_error():
+    """Test that zero n_estimators raises ValueError."""
+    X = np.array([[1, 2], [3, 4]])
+    y = np.array([0, 1])
+    with pytest.raises(ValueError):
+        LinearBoostClassifier(n_estimators=0).fit(X, y)
+
+
+def test_negative_learning_rate_error():
+    """Test that negative learning rate raises ValueError."""
+    X = np.array([[1, 2], [3, 4]])
+    y = np.array([0, 1])
+    with pytest.raises(ValueError):
+        LinearBoostClassifier(learning_rate=-0.1).fit(X, y)
+
+
+def test_samme_algorithm_worse_than_random():
+    """Test the SAMME algorithm with a base classifier worse than random."""
+    # Create a dataset where a linear classifier will perform very poorly
+    # This is challenging to create reliably, so we'll use mocking
+    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
+    y = np.array([0, 1, 0, 1])
+
+    clf = LinearBoostClassifier(n_estimators=2, algorithm="SAMME")
+
+    # Mock the SEFR predict method to always return wrong predictions
+    original_predict = clf.estimator.predict
+
+    def bad_predict(X):
+        # Always predict the opposite class to simulate worse than random
+        predictions = original_predict(X)
+        return 1 - predictions  # Flip all predictions
+
+    try:
+        with patch.object(clf.estimator, "predict", bad_predict):
+            clf.fit(X, y)
+        # If no error is raised, that's also fine - it means the algorithm handled it
+    except ValueError as e:
+        # This would cover the error case we want to test
+        assert (
+            "BaseClassifier in AdaBoostClassifier ensemble is worse than random"
+            in str(e)
+        )
+
+
+# Algorithmic behavior tests
+@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
+def test_boosting_improves_performance(algorithm):
+    """Test that boosting generally improves performance over a single estimator."""
+    X, y = make_classification(
+        n_samples=200,
+        n_features=10,
+        n_redundant=0,
+        random_state=42,
+        n_clusters_per_class=1,
+    )
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.3, random_state=42
+    )
+
+    # Single estimator
+    clf_single = LinearBoostClassifier(n_estimators=1, algorithm=algorithm)
+    clf_single.fit(X_train, y_train)
+    score_single = clf_single.score(X_test, y_test)
+
+    # Multiple estimators
+    clf_multiple = LinearBoostClassifier(n_estimators=20, algorithm=algorithm)
+    clf_multiple.fit(X_train, y_train)
+    score_multiple = clf_multiple.score(X_test, y_test)
+
+    # Boosting should generally improve performance (allow some tolerance for variability)
+    assert (
+        score_multiple >= score_single - 0.1
+    )  # Allow small decrease due to randomness
+
+
+def test_reproducibility_with_random_state():
+    """Test that repeated fits on the same data give identical results."""
+    X, y = make_classification(
+        n_samples=100,
+        n_features=4,
+        n_redundant=0,
+        random_state=42,
+        n_clusters_per_class=1,
+    )
+
+    # Note: LinearBoostClassifier doesn't have a random_state parameter,
+    # but underlying data generation should be reproducible
+    clf1 = LinearBoostClassifier(n_estimators=10)
+    clf1.fit(X, y)
+    pred1 = clf1.predict(X)
+
+    clf2 = LinearBoostClassifier(n_estimators=10)
+    clf2.fit(X, y)
+    pred2 = clf2.predict(X)
+
+    # Results should be reproducible with same data
+    assert_array_equal(pred1, pred2)
+
+
+def test_early_stopping_behavior():
+    """Test early stopping when perfect fit is achieved."""
+    # Create very simple linearly separable data
+    X = np.array([[0, 0], [1, 1], [10, 10], [11, 11]])
+    y = np.array([0, 0, 1, 1])
+
+    clf = LinearBoostClassifier(n_estimators=100)
+    clf.fit(X, y)
+
+    # Should achieve perfect fit quickly and possibly stop early
+    assert len(clf.estimators_) <= 100
+    accuracy = clf.score(X, y)
+    assert accuracy >= 0.9  # Should achieve high accuracy
+
+
+def test_estimator_weights_properties():
+    """Test properties of estimator weights."""
+    X, y = make_classification(
+        n_samples=100,
+        n_features=4,
+        n_redundant=0,
+        random_state=42,
+        n_clusters_per_class=1,
+    )
+
+    clf = LinearBoostClassifier(n_estimators=10)
+    clf.fit(X, y)
+
+    # All weights should be non-negative (for SAMME.R)
+    assert np.count_nonzero(clf.estimator_weights_) == len(clf.estimators_)
+    assert np.all(np.array(clf.estimator_weights_) >= 0)
+
+
+def test_estimator_errors_properties():
+    """Test properties of estimator errors."""
+    X, y = make_classification(
+        n_samples=100,
+        n_features=4,
+        n_redundant=0,
+        random_state=42,
+        n_clusters_per_class=1,
+    )
+
+    clf = LinearBoostClassifier(n_estimators=10)
+    clf.fit(X, y)
+
+    assert (clf.estimator_errors_ != 1).sum() == len(clf.estimators_)
+    errors = np.array(clf.estimator_errors_)
+    assert np.all(errors >= 0)
+
+
+def test_scaler_attribute_properties():
+    """Test properties of the scaler_ attribute."""
+    X, y = make_classification(
+        n_samples=50,
+        n_features=4,
+        n_redundant=0,
+        random_state=42,
+        n_clusters_per_class=1,
+    )
+
+    clf = LinearBoostClassifier(scaler="standard")
+    clf.fit(X, y)
+
+    assert hasattr(clf, "scaler_")
+    assert hasattr(clf.scaler_, "transform")
+    assert hasattr(clf.scaler_, "fit_transform")
+
+
+def test_feature_names_in():
+    """Test feature_names_in_ attribute when using pandas DataFrame."""
+    try:
+        import pandas as pd
+
+        # Create DataFrame with named features
+        X, y = make_classification(
+            n_samples=50,
+            n_features=3,
+            n_redundant=0,
+            random_state=42,
+            n_clusters_per_class=1,
+        )
+        feature_names = ["feature1", "feature2", "feature3"]
+        X_df = pd.DataFrame(X, columns=feature_names)
+
+        clf = LinearBoostClassifier(n_estimators=5)
+        clf.fit(X_df, y)
+
+        assert hasattr(clf, "feature_names_in_")
+        assert_array_equal(clf.feature_names_in_, feature_names)
+
+    except ImportError:
+        pytest.skip("pandas not available")
+
+
+def test_classes_attribute():
+    """Test the classes_ attribute after fitting."""
+    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
+    y = np.array([0, 1, 0, 1])
+
+    clf = LinearBoostClassifier(n_estimators=5)
+    clf.fit(X, y)
+
+    assert hasattr(clf, "classes_")
+    assert_array_equal(clf.classes_, [0, 1])
+
+
+def test_classes_attribute_string_labels():
+    """Test the classes_ attribute with string labels."""
+    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
+    y = np.array(["cat", "dog", "cat", "dog"])
+
+    clf = LinearBoostClassifier(n_estimators=5)
+    clf.fit(X, y)
+
+    assert hasattr(clf, "classes_")
+    assert_array_equal(clf.classes_, ["cat", "dog"])
+
+
+def test_breast_cancer_dataset():
+    """Test on real-world breast cancer dataset."""
+    X, y = load_breast_cancer(return_X_y=True)
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.3, random_state=42
+    )
+
+    clf = LinearBoostClassifier(n_estimators=20)
+    clf.fit(X_train, y_train)
+
+    # Check basic functionality
+    predictions = clf.predict(X_test)
+    probabilities = clf.predict_proba(X_test)
+    score = clf.score(X_test, y_test)
+
+    assert len(predictions) == len(X_test)
+    assert probabilities.shape == (len(X_test), 2)
+    assert 0 <= score <= 1
+    assert score > 0.5  # Should be better than random guessing
+
+
+def test_memory_efficiency():
+    """Test that LinearBoostClassifier doesn't consume excessive memory."""
+    X, y = make_classification(
+        n_samples=200,
+        n_features=10,
+        n_redundant=0,
+        random_state=42,
+        n_clusters_per_class=1,
+    )
+
+    clf = LinearBoostClassifier(n_estimators=10)
+    clf.fit(X, y)
+
+    # Check that the model doesn't store the training data
+    assert not hasattr(clf, "X_")
+    assert not hasattr(clf, "y_")
+
+    # Check that estimators are SEFR instances (lightweight)
+    for estimator in clf.estimators_:
+        assert estimator.__class__.__name__ == "SEFR"
+
+
+def test_different_class_labels():
+    """Test with different types of class labels."""
+    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
+
+    # Test with different label encodings
+    label_sets = [
+        [0, 1, 0, 1],  # Standard 0/1
+        [1, 2, 1, 2],  # 1/2 encoding
+        [-1, 1, -1, 1],  # -1/1 encoding
+        ["A", "B", "A", "B"],  # String labels
+    ]
+
+    for y in label_sets:
+        y = np.array(y)
+        clf = LinearBoostClassifier(n_estimators=3)
+        clf.fit(X, y)
+
+        predictions = clf.predict(X)
+        assert len(predictions) == len(X)
+        assert set(predictions).issubset(set(y))
+
+
+def test_numerical_stability():
+    """Test numerical stability with extreme values."""
+    # Test with very small values
+    X_small = np.array([[1e-10, 2e-10], [3e-10, 4e-10], [5e-10, 6e-10], [7e-10, 8e-10]])
+    y = np.array([0, 0, 1, 1])
+
+    clf = LinearBoostClassifier(n_estimators=3)
+    clf.fit(X_small, y)
+    predictions = clf.predict(X_small)
+
+    assert len(predictions) == len(X_small)
+
+    # Test with large values
+    X_large = np.array([[1e6, 2e6], [3e6, 4e6], [5e6, 6e6], [7e6, 8e6]])
+
+    clf_large = LinearBoostClassifier(n_estimators=3)
+    clf_large.fit(X_large, y)
+    predictions_large = clf_large.predict(X_large)
+
+    assert len(predictions_large) == len(X_large)
+
+
+@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
+def test_algorithm_differences(algorithm):
+    """Test that each boosting algorithm produces valid outputs."""
+    X, y = make_classification(
+        n_samples=100,
+        n_features=4,
+        n_redundant=0,
+        random_state=42,
+        n_clusters_per_class=1,
+    )
+
+    clf = 
LinearBoostClassifier(n_estimators=10, algorithm=algorithm) + clf.fit(X, y) + + # Both algorithms should work + predictions = clf.predict(X) + probabilities = clf.predict_proba(X) + decision = clf.decision_function(X) + + assert len(predictions) == len(X) + assert probabilities.shape == (len(X), 2) + assert decision.shape == (len(X),) + + +def test_learning_rate_effect(): + """Test that learning rate affects the model behavior.""" + X, y = make_classification( + n_samples=100, + n_features=4, + n_redundant=0, + random_state=42, + n_clusters_per_class=1, + ) + + # Very low learning rate + clf_low = LinearBoostClassifier(n_estimators=10, learning_rate=0.1) + clf_low.fit(X, y) + + # Higher learning rate + clf_high = LinearBoostClassifier(n_estimators=10, learning_rate=1.0) + clf_high.fit(X, y) + + # Both should work + pred_low = clf_low.predict(X) + pred_high = clf_high.predict(X) + + assert len(pred_low) == len(X) + assert len(pred_high) == len(X) + + +def test_n_estimators_effect(): + """Test that n_estimators affects the number of fitted estimators.""" + X, y = make_classification( + n_samples=100, + n_features=4, + n_redundant=0, + random_state=42, + n_clusters_per_class=1, + ) + + for n_est in [1, 5, 10]: + clf = LinearBoostClassifier(n_estimators=n_est) + clf.fit(X, y) + + # Should fit at most n_est estimators (could be fewer due to early stopping) + assert len(clf.estimators_) <= n_est + assert len(clf.estimators_) >= 1 # Should fit at least one + + +def test_sample_weight_filtering_1d(): + """Test sample weight filtering with 1D sample weights.""" + X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) + y = np.array([0, 1, 0, 1]) + # Some zero weights to trigger filtering + sample_weight = np.array([1.0, 0.0, 1.0, 0.5]) # 1D array with some zeros + + clf = LinearBoostClassifier(n_estimators=3) + clf.fit(X, y, sample_weight=sample_weight) + + # Should work and filter out zero-weight samples + assert hasattr(clf, "estimators_") + predictions = clf.predict(X) + assert len(predictions) == len(X) + + +def test_sample_weight_filtering_2d(): + """Test sample weight filtering with 2D sample weights.""" + # This test is more theoretical since sklearn doesn't allow 2D sample weights + # But we can test the code logic by mocking the internal behavior + X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) + y = np.array([0, 1, 0, 1]) + + # Test the logic by directly calling the internal method with 2D weights + clf = LinearBoostClassifier(n_estimators=3) + + # Mock the sample weight processing to test the ndim > 1 branch + sample_weight_2d = np.array([[1.0], [0.0], [1.0], [0.5]]) + + # Test the condition logic directly + nonzero_mask = ( + sample_weight_2d.sum(axis=1) != 0 + if sample_weight_2d.ndim > 1 + else sample_weight_2d != 0 + ) + + # Verify the 2D logic works as expected + assert sample_weight_2d.ndim > 1 + expected_mask = np.array([True, False, True, True]) # [1.0], [0.0], [1.0], [0.5] + assert np.array_equal(nonzero_mask, expected_mask) + + # Now test with 1D weights (which sklearn accepts) + sample_weight_1d = sample_weight_2d.flatten() + clf.fit(X, y, sample_weight=sample_weight_1d) + + assert hasattr(clf, "estimators_") + predictions = clf.predict(X) + assert len(predictions) == len(X) diff --git a/tests/test_sefr.py b/tests/test_sefr.py index 5273346..ff3eaa8 100644 --- a/tests/test_sefr.py +++ b/tests/test_sefr.py @@ -1,4 +1,16 @@ +from unittest.mock import patch + +import numpy as np +import pytest from sklearn.base import is_classifier +from sklearn.datasets import load_breast_cancer, 
make_classification +from sklearn.exceptions import NotFittedError +from sklearn.model_selection import train_test_split +from sklearn.utils._testing import ( + assert_allclose, + assert_array_almost_equal, + assert_array_equal, +) from linearboost.sefr import SEFR @@ -6,8 +18,540 @@ def test_sefr_estimator(): - """ - Test whether `SEFR` classifier adheres to scikit-learn conventions. - """ + """Test whether SEFR classifier adheres to scikit-learn conventions.""" check_estimator(SEFR(), expected_failed_checks=get_expected_failed_tests(SEFR())) assert is_classifier(SEFR) + + +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_init_parameters(fit_intercept): + """Test SEFR initialization with different parameters.""" + sefr = SEFR(fit_intercept=fit_intercept) + assert sefr.fit_intercept == fit_intercept + + +def test_init_default_parameters(): + """Test SEFR initialization with default parameters.""" + sefr = SEFR() + assert sefr.fit_intercept is True + + +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_fit_simple_data(fit_intercept): + """Test fitting on simple 2D data.""" + X = np.array([[0, 0], [1, 1], [2, 2], [3, 3]]) + y = np.array([0, 0, 1, 1]) + + sefr = SEFR(fit_intercept=fit_intercept) + sefr.fit(X, y) + + # Check fitted attributes + assert hasattr(sefr, "coef_") + assert hasattr(sefr, "intercept_") + assert hasattr(sefr, "classes_") + assert hasattr(sefr, "n_features_in_") + + # Check shapes + assert sefr.coef_.shape == (1, X.shape[1]) + assert sefr.intercept_.shape == (1,) + assert sefr.classes_.shape == (2,) + assert sefr.n_features_in_ == X.shape[1] + + # Check intercept behavior + if not fit_intercept: + assert sefr.intercept_[0] == 0.0 + + +def test_fit_with_sample_weight(): + """Test fitting with sample weights.""" + X, y = make_classification( + n_samples=100, + n_features=4, + n_redundant=0, + n_informative=4, + random_state=42, + n_clusters_per_class=1, + ) + sample_weight = np.random.RandomState(42).rand(len(X)) + + sefr = SEFR() + sefr.fit(X, y, sample_weight=sample_weight) + + assert hasattr(sefr, "coef_") + assert hasattr(sefr, "intercept_") + assert sefr.coef_.shape == (1, X.shape[1]) + + +@pytest.mark.parametrize("n_samples,n_features", [(50, 3), (100, 5), (200, 10)]) +def test_fit_various_sizes(n_samples, n_features): + """Test fitting on datasets of various sizes.""" + X, y = make_classification( + n_samples=n_samples, + n_features=n_features, + n_redundant=0, + random_state=42, + n_clusters_per_class=1, + ) + + sefr = SEFR() + sefr.fit(X, y) + + assert sefr.coef_.shape == (1, n_features) + assert sefr.n_features_in_ == n_features + + +def test_predict_binary_classification(): + """Test prediction for binary classification.""" + X, y = make_classification( + n_samples=100, + n_features=4, + n_redundant=0, + random_state=42, + n_clusters_per_class=1, + ) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.3, random_state=42 + ) + + sefr = SEFR() + sefr.fit(X_train, y_train) + predictions = sefr.predict(X_test) + + assert predictions.shape == (X_test.shape[0],) + assert set(predictions).issubset(set(y_train)) + + +def test_predict_proba(): + """Test probability prediction.""" + X, y = make_classification( + n_samples=100, + n_features=4, + n_redundant=0, + random_state=42, + n_clusters_per_class=1, + ) + + sefr = SEFR() + sefr.fit(X, y) + probas = sefr.predict_proba(X) + + # Check shape + assert probas.shape == (X.shape[0], 2) + + # Check probabilities are valid + assert np.all(probas >= 0) + assert 
np.all(probas <= 1) + assert_allclose(probas.sum(axis=1), 1.0, rtol=1e-7) + + +def test_predict_log_proba(): + """Test log probability prediction.""" + X, y = make_classification( + n_samples=100, + n_features=4, + n_redundant=0, + random_state=42, + n_clusters_per_class=1, + ) + + sefr = SEFR() + sefr.fit(X, y) + log_probas = sefr.predict_log_proba(X) + + # Check shape + assert log_probas.shape == (X.shape[0], 2) + + # Check all values are <= 0 (log probabilities) + assert np.all(log_probas <= 0) + + # Compare with predict_proba + probas = sefr.predict_proba(X) + expected_log_probas = np.log(probas) + assert_allclose(log_probas, expected_log_probas, rtol=1e-7) + + +def test_decision_function(): + """Test decision function.""" + X, y = make_classification( + n_samples=100, + n_features=4, + n_redundant=0, + random_state=42, + n_clusters_per_class=1, + ) + + sefr = SEFR() + sefr.fit(X, y) + decision = sefr.decision_function(X) + + assert decision.shape == (X.shape[0],) + + # Check consistency with predict + predictions = sefr.predict(X) + expected_predictions = (decision > 0).astype(int) + + # Map to class labels + predicted_classes = sefr.classes_[expected_predictions] + assert_array_equal(predictions, predicted_classes) + + +def test_score_method(): + """Test the score method.""" + X, y = load_breast_cancer(return_X_y=True) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.3, random_state=42 + ) + + sefr = SEFR() + sefr.fit(X_train, y_train) + score = sefr.score(X_test, y_test) + + assert isinstance(score, float) + assert 0 <= score <= 1 + + +# Error handling tests +def test_predict_before_fit_raises_error(): + """Test that predicting before fitting raises NotFittedError.""" + X = np.array([[1, 2], [3, 4]]) + sefr = SEFR() + + with pytest.raises(NotFittedError): + sefr.predict(X) + + +def test_predict_proba_before_fit_raises_error(): + """Test that predict_proba before fitting raises NotFittedError.""" + X = np.array([[1, 2], [3, 4]]) + sefr = SEFR() + + with pytest.raises(NotFittedError): + sefr.predict_proba(X) + + +def test_decision_function_before_fit_raises_error(): + """Test that decision_function before fitting raises NotFittedError.""" + X = np.array([[1, 2], [3, 4]]) + sefr = SEFR() + + with pytest.raises(NotFittedError): + sefr.decision_function(X) + + +def test_single_class_error(): + """Test that fitting with single class raises ValueError.""" + X = np.array([[1, 2], [3, 4], [5, 6]]) + y = np.array([0, 0, 0]) # All same class + + sefr = SEFR() + with pytest.raises( + ValueError, match="Classifier can't train when only one class is present" + ): + sefr.fit(X, y) + + +def test_multiclass_error(): + """Test that multiclass data raises ValueError.""" + X = np.array([[1, 2], [3, 4], [5, 6]]) + y = np.array([0, 1, 2]) # Three classes + + sefr = SEFR() + with pytest.raises( + ValueError, + match=r"Only binary classification is supported|Unknown label type: non-binary", + ): + sefr.fit(X, y) + + +def test_multiclass_error_sklearn_pre_16(): + """Test multiclass error with sklearn < 1.6 message.""" + X = np.array([[1, 2], [3, 4], [5, 6]]) + y = np.array([0, 1, 2]) # Three classes + + sefr = SEFR() + + # Mock SKLEARN_V1_6_OR_LATER to be False to test the older sklearn message + with patch("linearboost.sefr.SKLEARN_V1_6_OR_LATER", False): + with pytest.raises(ValueError, match="Unknown label type: non-binary"): + sefr.fit(X, y) + + +def test_invalid_sample_weight_shape(): + """Test that invalid sample weight shape raises ValueError.""" + X = np.array([[1, 2], [3, 
4]]) + y = np.array([0, 1]) + sample_weight = np.array([0.5]) # Wrong length + + sefr = SEFR() + with pytest.raises((ValueError, IndexError)): + sefr.fit(X, y, sample_weight=sample_weight) + + +def test_zero_sample_weights_error(): + """Test that zero sample weights for all samples of a class raises error.""" + X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) + y = np.array([0, 0, 1, 1]) + sample_weight = np.array( + [0.0, 0.0, 1.0, 1.0] + ) # All class 0 samples have zero weight + + sefr = SEFR() + with pytest.raises(ValueError, match="SEFR requires 2 classes"): + sefr.fit(X, y, sample_weight=sample_weight) + + +def test_inconsistent_feature_numbers(): + """Test that inconsistent feature numbers raise ValueError.""" + X_train = np.array([[1, 2], [3, 4]]) + y_train = np.array([0, 1]) + X_test = np.array([[1, 2, 3]]) # Different number of features + + sefr = SEFR() + sefr.fit(X_train, y_train) + + with pytest.raises(ValueError): + sefr.predict(X_test) + + +def test_empty_data_error(): + """Test that empty data raises appropriate error.""" + X = np.array([]).reshape(0, 2) + y = np.array([]) + + sefr = SEFR() + with pytest.raises(ValueError): + sefr.fit(X, y) + + +# Mathematical properties tests +def test_linear_separable_perfect_classification(): + """Test that SEFR works well on linearly separable data.""" + # Create perfectly linearly separable data + np.random.seed(42) + X = np.array([[0, 0], [0, 1], [1, 0], [1, 1], [2, 2], [2, 3], [3, 2], [3, 3]]) + y = np.array([0, 0, 0, 0, 1, 1, 1, 1]) + + sefr = SEFR() + sefr.fit(X, y) + predictions = sefr.predict(X) + + # Should achieve perfect or near-perfect accuracy on this simple dataset + accuracy = (predictions == y).mean() + assert accuracy >= 0.75 # At least 75% accuracy + + +def test_reproducibility(): + """Test that results are reproducible with same data.""" + X, y = make_classification( + n_samples=100, + n_features=4, + n_redundant=0, + random_state=42, + n_clusters_per_class=1, + ) + + sefr1 = SEFR() + sefr1.fit(X, y) + pred1 = sefr1.predict(X) + + sefr2 = SEFR() + sefr2.fit(X, y) + pred2 = sefr2.predict(X) + + assert_array_equal(pred1, pred2) + assert_array_almost_equal(sefr1.coef_, sefr2.coef_) + + +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_coefficient_properties(fit_intercept): + """Test mathematical properties of fitted coefficients.""" + X, y = make_classification( + n_samples=100, + n_features=4, + n_redundant=0, + random_state=42, + n_clusters_per_class=1, + ) + + sefr = SEFR(fit_intercept=fit_intercept) + sefr.fit(X, y) + + # Coefficients should be finite + assert np.all(np.isfinite(sefr.coef_)) + + # Intercept behavior + if fit_intercept: + assert np.isfinite(sefr.intercept_[0]) + else: + assert sefr.intercept_[0] == 0.0 + + +def test_feature_names_in(): + """Test feature_names_in_ attribute when using pandas DataFrame.""" + try: + import pandas as pd + + # Create DataFrame with named features + X, y = make_classification( + n_samples=50, + n_features=3, + n_redundant=0, + random_state=42, + n_clusters_per_class=1, + ) + feature_names = ["feature1", "feature2", "feature3"] + X_df = pd.DataFrame(X, columns=feature_names) + + sefr = SEFR() + sefr.fit(X_df, y) + + assert hasattr(sefr, "feature_names_in_") + assert_array_equal(sefr.feature_names_in_, feature_names) + + except ImportError: + pytest.skip("pandas not available") + + +def test_classes_attribute(): + """Test the classes_ attribute after fitting.""" + X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) + y = np.array([0, 1, 0, 1]) + + sefr = SEFR() + 
sefr.fit(X, y) + + assert hasattr(sefr, "classes_") + assert_array_equal(sefr.classes_, [0, 1]) + + +def test_classes_attribute_string_labels(): + """Test the classes_ attribute with string labels.""" + X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) + y = np.array(["cat", "dog", "cat", "dog"]) + + sefr = SEFR() + sefr.fit(X, y) + + assert hasattr(sefr, "classes_") + assert_array_equal(sefr.classes_, ["cat", "dog"]) + + +def test_breast_cancer_dataset(): + """Test on real-world breast cancer dataset.""" + X, y = load_breast_cancer(return_X_y=True) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.3, random_state=42 + ) + + sefr = SEFR() + sefr.fit(X_train, y_train) + + # Check basic functionality + predictions = sefr.predict(X_test) + probabilities = sefr.predict_proba(X_test) + score = sefr.score(X_test, y_test) + + assert len(predictions) == len(X_test) + assert probabilities.shape == (len(X_test), 2) + assert 0 <= score <= 1 + assert score > 0.5 # Should be better than random guessing + + +def test_memory_efficiency(): + """Test that SEFR doesn't consume excessive memory.""" + # This is a basic test - in practice you might want more sophisticated memory profiling + X, y = make_classification( + n_samples=1000, + n_features=20, + n_redundant=0, + random_state=42, + n_clusters_per_class=1, + ) + + sefr = SEFR() + sefr.fit(X, y) + + # Check that the model doesn't store the training data + assert not hasattr(sefr, "X_") + assert not hasattr(sefr, "y_") + + # Check that coefficients are reasonably sized + assert sefr.coef_.nbytes < 1000 # Should be small for 20 features + + +def test_different_class_labels(): + """Test with different types of class labels.""" + X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) + + # Test with different label encodings + label_sets = [ + [0, 1, 0, 1], # Standard 0/1 + [1, 2, 1, 2], # 1/2 encoding + [-1, 1, -1, 1], # -1/1 encoding + ["A", "B", "A", "B"], # String labels + ] + + for y in label_sets: + y = np.array(y) + sefr = SEFR() + sefr.fit(X, y) + + predictions = sefr.predict(X) + assert len(predictions) == len(X) + assert set(predictions).issubset(set(y)) + + +def test_numerical_stability(): + """Test numerical stability with extreme values.""" + # Test with very small values + X_small = np.array([[1e-10, 2e-10], [3e-10, 4e-10], [5e-10, 6e-10], [7e-10, 8e-10]]) + y = np.array([0, 0, 1, 1]) + + sefr = SEFR() + sefr.fit(X_small, y) + predictions = sefr.predict(X_small) + + assert len(predictions) == len(X_small) + assert np.all(np.isfinite(sefr.coef_)) + + # Test with large values + X_large = np.array([[1e6, 2e6], [3e6, 4e6], [5e6, 6e6], [7e6, 8e6]]) + + sefr_large = SEFR() + sefr_large.fit(X_large, y) + predictions_large = sefr_large.predict(X_large) + + assert len(predictions_large) == len(X_large) + assert np.all(np.isfinite(sefr_large.coef_)) + + +def test_check_X_method(): + """Test the _check_X method directly.""" + X = np.array([[1, 2], [3, 4]]) + y = np.array([0, 1]) + + sefr = SEFR() + sefr.fit(X, y) + + # Test successful validation + X_valid = np.array([[5, 6], [7, 8]]) + result = sefr._check_X(X_valid) + assert result.shape == X_valid.shape + assert np.array_equal(result, X_valid) + + +def test_check_X_wrong_features(): + """Test _check_X method with wrong number of features.""" + X = np.array([[1, 2], [3, 4]]) + y = np.array([0, 1]) + + sefr = SEFR() + sefr.fit(X, y) + + # Mock validate_data to return X but skip sklearn's feature check, so we can test our custom feature count check + X_wrong = np.array([[1, 2, 3]]) # 3 
features instead of 2 + with patch("linearboost.sefr.validate_data", return_value=X_wrong): + with pytest.raises( + ValueError, match="Expected input with 2 features, got 3 instead" + ): + sefr._check_X(X_wrong)
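
Editor's note: the class_weight change at the top of this patch narrows the string
presets to "balanced", whose weighting formula is quoted in the LinearBoostClassifier
docstring and exercised by test_fit_with_class_weight above. A minimal sketch of what
that preset computes, assuming only NumPy and scikit-learn's public
compute_sample_weight helper (illustrative, not part of the patch):

    import numpy as np
    from sklearn.utils.class_weight import compute_sample_weight

    # "balanced" gives each class the weight n_samples / (n_classes * np.bincount(y)),
    # which is then broadcast onto each sample by its label.
    y = np.array([0, 0, 0, 0, 1])  # imbalanced labels: four 0s, one 1
    per_class = len(y) / (2 * np.bincount(y))  # -> array([0.625, 2.5])
    assert np.allclose(compute_sample_weight("balanced", y), per_class[y])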