From f407976ba2065339393ad26180f263905bd6d6b3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 29 Sep 2021 14:46:03 +0200 Subject: [PATCH] MNT Compatibility with sklearn 1.0 (#864) --- .pep8speaks.yml | 10 ----- .pre-commit-config.yaml | 13 +++---- azure-pipelines.yml | 31 +++++++++------ build_tools/azure/install.sh | 7 +++- doc/install.rst | 2 +- doc/sphinxext/sphinx_issues.py | 10 ++++- doc/whats_new/v0.8.rst | 13 +++++++ imblearn/base.py | 10 ++++- imblearn/ensemble/_forest.py | 14 +++++-- imblearn/ensemble/tests/test_easy_ensemble.py | 38 +++++++++++++++---- imblearn/ensemble/tests/test_forest.py | 18 +++++++-- imblearn/keras/tests/test_generator.py | 22 ++++++----- imblearn/metrics/_classification.py | 32 ++++++++-------- imblearn/metrics/tests/test_classification.py | 20 +++++++--- imblearn/metrics/tests/test_score_objects.py | 16 ++++++-- imblearn/over_sampling/_smote/cluster.py | 3 -- .../_smote/tests/test_kmeans_smote.py | 24 ++++++------ imblearn/over_sampling/tests/test_adasyn.py | 5 ++- .../tests/test_random_over_sampler.py | 9 ++++- imblearn/tests/test_base.py | 5 ++- imblearn/tests/test_common.py | 8 +++- imblearn/tests/test_pipeline.py | 33 +++++++++------- .../tests/test_cluster_centroids.py | 4 +- .../tests/test_edited_nearest_neighbours.py | 5 ++- .../tests/test_instance_hardness_threshold.py | 8 +++- .../tests/test_nearmiss.py | 4 +- .../tests/test_random_under_sampler.py | 4 +- ...test_repeated_edited_nearest_neighbours.py | 3 +- imblearn/utils/estimator_checks.py | 2 +- imblearn/utils/tests/test_validation.py | 12 +++++- 30 files changed, 252 insertions(+), 133 deletions(-) delete mode 100644 .pep8speaks.yml diff --git a/.pep8speaks.yml b/.pep8speaks.yml deleted file mode 100644 index 6a8eef8ce..000000000 --- a/.pep8speaks.yml +++ /dev/null @@ -1,10 +0,0 @@ -# File : .pep8speaks.yml - -scanner: - diff_only: False # If True, errors caused by only the patch are shown - linter: flake8 - -flake8: - max-line-length: 88 # Default is 79 in PEP 8 - ignore: # Errors and warnings to ignore - - E203 \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a8d252262..693b8c29e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,19 +1,16 @@ repos: -- repo: https://github.com/python/black - rev: stable - hooks: - - id: black - repo: https://github.com/pre-commit/pre-commit-hooks rev: v2.3.0 hooks: - id: check-yaml - id: end-of-file-fixer - id: trailing-whitespace +- repo: https://github.com/psf/black + rev: 21.6b0 + hooks: + - id: black - repo: https://gitlab.com/pycqa/flake8 - rev: 3.7.8 + rev: 3.9.2 hooks: - id: flake8 types: [file, python] - # only check for unused imports for now, as long as - # the code is not fully PEP8 compatible - args: [--select=F401] diff --git a/azure-pipelines.yml b/azure-pipelines.yml index e5b58374f..810100c39 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -3,23 +3,26 @@ jobs: - job: linting displayName: Linting pool: - vmImage: ubuntu-18.04 + vmImage: ubuntu-20.04 steps: - - bash: echo "##vso[task.prependpath]$CONDA/bin" - displayName: Add conda to PATH - - bash: sudo chown -R $USER $CONDA - displayName: Take ownership of conda installation - - bash: conda create --name flake8_env --yes flake8 - displayName: Install flake8 + - task: UsePythonVersion@0 + inputs: + versionSpec: '3.9' + - bash: | + # Include pytest compatibility with mypy + pip install pytest flake8 mypy==0.782 black==21.6b0 + displayName: Install linters + - bash: | + black --check . 
+ displayName: Run black - bash: | - source activate flake8_env ./build_tools/circle/linting.sh displayName: Run linting - template: build_tools/azure/posix.yml parameters: name: Linux_Runs - vmImage: ubuntu-18.04 + vmImage: ubuntu-20.04 matrix: pylatest_pip_openblas_pandas: DISTRIB: 'conda-pip-latest' @@ -33,7 +36,7 @@ jobs: - template: build_tools/azure/posix.yml parameters: name: Linux - vmImage: ubuntu-18.04 + vmImage: ubuntu-20.04 dependsOn: [linting] matrix: # Linux environment to test that scikit-learn can be built against @@ -41,7 +44,6 @@ jobs: # i.e. numpy 1.13.3 and scipy 0.19 py36_ubuntu_atlas: DISTRIB: 'ubuntu' - PYTHON_VERSION: '3.6' JOBLIB_VERSION: '*' pylatest_conda_pandas_keras: DISTRIB: 'conda' @@ -61,11 +63,16 @@ jobs: TENSORFLOW_VERSION: '*' COVERAGE: 'true' TEST_DOCSTRINGS: 'true' + sklearn_0_24_conda: + DISTRIB: 'conda' + PYTHON_VERSION: '3.7' + SKLEARN_VERSION: '0.24.2' + INSTALL_MKL: 'true' - template: build_tools/azure/posix-32.yml parameters: name: Linux32 - vmImage: ubuntu-18.04 + vmImage: ubuntu-20.04 dependsOn: [linting] matrix: py36_ubuntu_atlas_32bit: diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 79c5d5814..65764090e 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -32,10 +32,15 @@ if [[ "$DISTRIB" == "conda" ]]; then fi make_conda $TO_INSTALL - python -m pip install scikit-learn TO_INSTALL="" + if [[ -n "$SKLEARN_VERSION" ]]; then + TO_INSTALL="$TO_INSTALL scikit-learn=$SKLEARN_VERSION" + else + TO_INSTALL="$TO_INSTALL scikit-learn" + fi + if [[ -n "$PANDAS_VERSION" ]]; then TO_INSTALL="$TO_INSTALL pandas=$PANDAS_VERSION" fi diff --git a/doc/install.rst b/doc/install.rst index 9b5fbde5c..490fbb733 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -12,7 +12,7 @@ The imbalanced-learn package requires the following dependencies: * python (>=3.6) * numpy (>=1.13.3) * scipy (>=0.19.1) -* scikit-learn (>=0.23) +* scikit-learn (>=0.24) * keras 2 (optional) * tensorflow (optional) diff --git a/doc/sphinxext/sphinx_issues.py b/doc/sphinxext/sphinx_issues.py index 81b31670c..9ad5941c2 100644 --- a/doc/sphinxext/sphinx_issues.py +++ b/doc/sphinxext/sphinx_issues.py @@ -80,7 +80,11 @@ class IssueRole(object): EXTERNAL_REPO_REGEX = re.compile(r"^(\w+)/(.+)([#@])([\w]+)$") def __init__( - self, uri_config_option, format_kwarg, github_uri_template, format_text=None, + self, + uri_config_option, + format_kwarg, + github_uri_template, + format_text=None, ): self.uri_config_option = uri_config_option self.format_kwarg = format_kwarg @@ -103,7 +107,9 @@ def make_node(self, name, issue_no, config, options=None): ) path = name_map.get(name) ref = "https://github.com/{issues_github_path}/{path}/{n}".format( - issues_github_path="{}/{}".format(username, repo), path=path, n=issue, + issues_github_path="{}/{}".format(username, repo), + path=path, + n=issue, ) formatted_issue = self.format_text(issue).lstrip("#") text = "{username}/{repo}{symbol}{formatted_issue}".format(**locals()) diff --git a/doc/whats_new/v0.8.rst b/doc/whats_new/v0.8.rst index 58b9e11b9..38661e52e 100644 --- a/doc/whats_new/v0.8.rst +++ b/doc/whats_new/v0.8.rst @@ -1,5 +1,18 @@ .. _changes_0_8: +Version 0.8.1 +============= + +**In development** + +Changelog + +Maintenance +........... + +- Make `imbalanced-learn` compatible with `scikit-learn` 1.0. + :pr:`864` by :user:`Guillaume Lemaitre `. 
+
 Version 0.8.0
 =============
 
diff --git a/imblearn/base.py b/imblearn/base.py
index fae05b0d9..e35288af1 100644
--- a/imblearn/base.py
+++ b/imblearn/base.py
@@ -82,7 +82,9 @@ def fit_resample(self, X, y):
 
         output = self._fit_resample(X, y)
 
-        y_ = label_binarize(output[1], np.unique(y)) if binarize_y else output[1]
+        y_ = (
+            label_binarize(output[1], classes=np.unique(y)) if binarize_y else output[1]
+        )
 
         X_, y_ = arrays_transformer.transform(output[0], y_)
         return (X_, y_) if len(output) == 2 else (X_, y_, output[2])
@@ -284,7 +286,11 @@ def fit_resample(self, X, y):
 
         if self.validate:
 
-            y_ = label_binarize(output[1], np.unique(y)) if binarize_y else output[1]
+            y_ = (
+                label_binarize(output[1], classes=np.unique(y))
+                if binarize_y
+                else output[1]
+            )
 
             X_, y_ = arrays_transformer.transform(output[0], y_)
             return (X_, y_) if len(output) == 2 else (X_, y_, output[2])
diff --git a/imblearn/ensemble/_forest.py b/imblearn/ensemble/_forest.py
index 9a0f05d2a..7346e74d2 100644
--- a/imblearn/ensemble/_forest.py
+++ b/imblearn/ensemble/_forest.py
@@ -422,15 +422,13 @@ def fit(self, X, y, sample_weight=None):
         )
         if sample_weight is not None:
             sample_weight = _check_sample_weight(sample_weight, X)
+        self._n_features = X.shape[1]
 
         if issparse(X):
             # Pre-sort indices to avoid that each individual tree of the
             # ensemble sorts the indices.
             X.sort_indices()
 
-        # Remap output
-        _, self.n_features_ = X.shape
-
         y = np.atleast_1d(y)
         if y.ndim == 2 and y.shape[1] == 1:
             warn(
@@ -627,5 +625,13 @@ def _set_oob_score(self, X, y):
 
         self.oob_score_ = oob_score / self.n_outputs_
 
+    @property
+    def n_features_(self):
+        """Number of features when fitting the estimator."""
+        return getattr(self, "n_features_in_", self._n_features)
+
     def _more_tags(self):
-        return {"multioutput": False}
+        return {
+            "multioutput": False,
+            "multilabel": False,
+        }
diff --git a/imblearn/ensemble/tests/test_easy_ensemble.py b/imblearn/ensemble/tests/test_easy_ensemble.py
index 6bbd7f00d..fb118a92f 100644
--- a/imblearn/ensemble/tests/test_easy_ensemble.py
+++ b/imblearn/ensemble/tests/test_easy_ensemble.py
@@ -48,7 +48,10 @@ def test_easy_ensemble_classifier(n_estimators, base_estimator):
     # Check classification for various parameter settings.
     X, y = make_imbalance(
-        iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0,
+        iris.data,
+        iris.target,
+        sampling_strategy={0: 20, 1: 25, 2: 50},
+        random_state=0,
     )
     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
 
@@ -72,7 +75,10 @@ def test_easy_ensemble_classifier(n_estimators, base_estimator):
 
 def test_base_estimator():
     # Check base_estimator and its default values.
X, y = make_imbalance( - iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, + iris.data, + iris.target, + sampling_strategy={0: 20, 1: 25, 2: 50}, + random_state=0, ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) @@ -91,7 +97,10 @@ def test_base_estimator(): def test_bagging_with_pipeline(): X, y = make_imbalance( - iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, + iris.data, + iris.target, + sampling_strategy={0: 20, 1: 25, 2: 50}, + random_state=0, ) estimator = EasyEnsembleClassifier( n_estimators=2, @@ -109,7 +118,9 @@ def test_warm_start(random_state=42): for n_estimators in [5, 10]: if clf_ws is None: clf_ws = EasyEnsembleClassifier( - n_estimators=n_estimators, random_state=random_state, warm_start=True, + n_estimators=n_estimators, + random_state=random_state, + warm_start=True, ) else: clf_ws.set_params(n_estimators=n_estimators) @@ -182,7 +193,10 @@ def test_warm_start_equivalence(): ) def test_easy_ensemble_classifier_error(n_estimators, msg_error): X, y = make_imbalance( - iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, + iris.data, + iris.target, + sampling_strategy={0: 20, 1: 25, 2: 50}, + random_state=0, ) with pytest.raises(ValueError, match=msg_error): eec = EasyEnsembleClassifier(n_estimators=n_estimators) @@ -191,7 +205,10 @@ def test_easy_ensemble_classifier_error(n_estimators, msg_error): def test_easy_ensemble_classifier_single_estimator(): X, y = make_imbalance( - iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, + iris.data, + iris.target, + sampling_strategy={0: 20, 1: 25, 2: 50}, + random_state=0, ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) @@ -205,7 +222,10 @@ def test_easy_ensemble_classifier_single_estimator(): def test_easy_ensemble_classifier_grid_search(): X, y = make_imbalance( - iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, + iris.data, + iris.target, + sampling_strategy={0: 20, 1: 25, 2: 50}, + random_state=0, ) parameters = { @@ -213,6 +233,8 @@ def test_easy_ensemble_classifier_grid_search(): "base_estimator__n_estimators": [3, 4], } grid_search = GridSearchCV( - EasyEnsembleClassifier(base_estimator=AdaBoostClassifier()), parameters, cv=5, + EasyEnsembleClassifier(base_estimator=AdaBoostClassifier()), + parameters, + cv=5, ) grid_search.fit(X, y) diff --git a/imblearn/ensemble/tests/test_forest.py b/imblearn/ensemble/tests/test_forest.py index b78cf417d..ad1420631 100644 --- a/imblearn/ensemble/tests/test_forest.py +++ b/imblearn/ensemble/tests/test_forest.py @@ -32,7 +32,10 @@ def imbalanced_dataset(): [ ({"n_estimators": "whatever"}, "n_estimators must be an integer"), ({"n_estimators": -100}, "n_estimators must be greater than zero"), - ({"bootstrap": False, "oob_score": True}, "Out of bag estimation only",), + ( + {"bootstrap": False, "oob_score": True}, + "Out of bag estimation only", + ), ], ) def test_balanced_random_forest_error(imbalanced_dataset, forest_params, err_msg): @@ -105,7 +108,10 @@ def test_balanced_random_forest_oob(imbalanced_dataset): X, y, random_state=42, stratify=y ) est = BalancedRandomForestClassifier( - oob_score=True, random_state=0, n_estimators=1000, min_samples_leaf=2, + oob_score=True, + random_state=0, + n_estimators=1000, + min_samples_leaf=2, ) est.fit(X_train, y_train) @@ -135,12 +141,16 @@ def test_little_tree_with_small_max_samples(): # First fit with no restriction on max samples est1 = 
BalancedRandomForestClassifier( - n_estimators=1, random_state=rng, max_samples=None, + n_estimators=1, + random_state=rng, + max_samples=None, ) # Second fit with max samples restricted to just 2 est2 = BalancedRandomForestClassifier( - n_estimators=1, random_state=rng, max_samples=2, + n_estimators=1, + random_state=rng, + max_samples=2, ) est1.fit(X, y) diff --git a/imblearn/keras/tests/test_generator.py b/imblearn/keras/tests/test_generator.py index 841cbe564..40c10b6a3 100644 --- a/imblearn/keras/tests/test_generator.py +++ b/imblearn/keras/tests/test_generator.py @@ -6,17 +6,17 @@ from sklearn.datasets import load_iris keras = pytest.importorskip("keras") -from keras.models import Sequential -from keras.layers import Dense -from keras.utils import to_categorical +from keras.models import Sequential # noqa: E402 +from keras.layers import Dense # noqa: E402 +from keras.utils import to_categorical # noqa: E402 -from imblearn.datasets import make_imbalance -from imblearn.under_sampling import ClusterCentroids -from imblearn.under_sampling import NearMiss -from imblearn.over_sampling import RandomOverSampler +from imblearn.datasets import make_imbalance # noqa: E402 +from imblearn.under_sampling import ClusterCentroids # noqa: E402 +from imblearn.under_sampling import NearMiss # noqa: E402 +from imblearn.over_sampling import RandomOverSampler # noqa: E402 -from imblearn.keras import BalancedBatchGenerator -from imblearn.keras import balanced_batch_generator +from imblearn.keras import BalancedBatchGenerator # noqa: E402 +from imblearn.keras import balanced_batch_generator # noqa: E402 @pytest.fixture @@ -112,7 +112,9 @@ def test_balanced_batch_generator_function(data, sampler, sample_weight): random_state=42, ) model.fit_generator( - generator=training_generator, steps_per_epoch=steps_per_epoch, epochs=10, + generator=training_generator, + steps_per_epoch=steps_per_epoch, + epochs=10, ) diff --git a/imblearn/metrics/_classification.py b/imblearn/metrics/_classification.py index 34cd8ee52..6f450598a 100644 --- a/imblearn/metrics/_classification.py +++ b/imblearn/metrics/_classification.py @@ -147,11 +147,11 @@ def sensitivity_specificity_support( >>> y_true = np.array(['cat', 'dog', 'pig', 'cat', 'dog', 'pig']) >>> y_pred = np.array(['cat', 'pig', 'dog', 'cat', 'cat', 'dog']) >>> sensitivity_specificity_support(y_true, y_pred, average='macro') - (0.33333333333333331, 0.66666666666666663, None) + (0.33..., 0.66..., None) >>> sensitivity_specificity_support(y_true, y_pred, average='micro') - (0.33333333333333331, 0.66666666666666663, None) + (0.33..., 0.66..., None) >>> sensitivity_specificity_support(y_true, y_pred, average='weighted') - (0.33333333333333331, 0.66666666666666663, None) + (0.33..., 0.66..., None) """ average_options = (None, "micro", "macro", "weighted", "samples") if average not in average_options and average != "binary": @@ -367,11 +367,11 @@ def sensitivity_score( >>> y_true = [0, 1, 2, 0, 1, 2] >>> y_pred = [0, 2, 1, 0, 0, 1] >>> sensitivity_score(y_true, y_pred, average='macro') - 0.33333333333333331 + 0.33... >>> sensitivity_score(y_true, y_pred, average='micro') - 0.33333333333333331 + 0.33... >>> sensitivity_score(y_true, y_pred, average='weighted') - 0.33333333333333331 + 0.33... >>> sensitivity_score(y_true, y_pred, average=None) array([ 1., 0., 0.]) """ @@ -468,11 +468,11 @@ def specificity_score( >>> y_true = [0, 1, 2, 0, 1, 2] >>> y_pred = [0, 2, 1, 0, 0, 1] >>> specificity_score(y_true, y_pred, average='macro') - 0.66666666666666663 + 0.66... 
>>> specificity_score(y_true, y_pred, average='micro') - 0.66666666666666663 + 0.66... >>> specificity_score(y_true, y_pred, average='weighted') - 0.66666666666666663 + 0.66... >>> specificity_score(y_true, y_pred, average=None) array([ 0.75, 0.5 , 0.75]) """ @@ -597,15 +597,15 @@ class is unrecognized by the classifier, G-mean resolves to zero. To >>> geometric_mean_score(y_true, y_pred) 0.0 >>> geometric_mean_score(y_true, y_pred, correction=0.001) - 0.010000000000000004 + 0.010... >>> geometric_mean_score(y_true, y_pred, average='macro') - 0.47140452079103168 + 0.471... >>> geometric_mean_score(y_true, y_pred, average='micro') - 0.47140452079103168 + 0.471... >>> geometric_mean_score(y_true, y_pred, average='weighted') - 0.47140452079103168 + 0.471... >>> geometric_mean_score(y_true, y_pred, average=None) - array([ 0.8660254, 0. , 0. ]) + array([ 0.866..., 0. , 0. ]) """ if average is None or average != "multiclass": sen, spe, _ = sensitivity_specificity_support( @@ -721,7 +721,7 @@ def make_index_balanced_accuracy(*, alpha=0.1, squared=True): >>> y_true = [1, 0, 0, 1, 0, 1] >>> y_pred = [0, 0, 1, 1, 0, 1] >>> print(gmean(y_true, y_pred, average=None)) - [ 0.44444444 0.44444444] + [ 0.44... 0.44...] """ def decorate(scoring_func): @@ -1033,7 +1033,7 @@ def macro_averaged_mean_absolute_error(y_true, y_pred, *, sample_weight=None): >>> macro_averaged_mean_absolute_error(y_true_balanced, y_pred) 0.5 >>> macro_averaged_mean_absolute_error(y_true_imbalanced, y_pred) - 0.16666666666666666 + 0.16... """ _, y_true, y_pred = _check_targets(y_true, y_pred) if sample_weight is not None: diff --git a/imblearn/metrics/tests/test_classification.py b/imblearn/metrics/tests/test_classification.py index 6a7e50f0d..47a37a5b8 100644 --- a/imblearn/metrics/tests/test_classification.py +++ b/imblearn/metrics/tests/test_classification.py @@ -206,7 +206,12 @@ def test_geometric_mean_support_binary(): ([0, 0, 0, 0], [0, 0, 0, 0], 0.001, 1.0), ([0, 0, 0, 0], [1, 1, 1, 1], 0.001, 0.001), ([0, 0, 1, 1], [0, 1, 1, 0], 0.001, 0.5), - ([0, 1, 2, 0, 1, 2], [0, 2, 1, 0, 0, 1], 0.001, (0.001 ** 2) ** (1 / 3),), + ( + [0, 1, 2, 0, 1, 2], + [0, 2, 1, 0, 0, 1], + 0.001, + (0.001 ** 2) ** (1 / 3), + ), ([0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], 0.001, 1), ([0, 1, 1, 1, 1, 0], [0, 0, 1, 1, 1, 1], 0.001, (0.5 * 0.75) ** 0.5), ], @@ -255,7 +260,11 @@ def test_geometric_mean_sample_weight( y_true, y_pred, sample_weight, average, expected_gmean ): gmean = geometric_mean_score( - y_true, y_pred, labels=[0, 1], sample_weight=sample_weight, average=average, + y_true, + y_pred, + labels=[0, 1], + sample_weight=sample_weight, + average=average, ) assert gmean == pytest.approx(expected_gmean, rel=R_TOL) @@ -471,7 +480,7 @@ def test_classification_report_imbalanced_dict(): "avg_iba", "total_support", } - expected_inner_keys = {'spe', 'f1', 'sup', 'rec', 'geo', 'iba', 'pre'} + expected_inner_keys = {"spe", "f1", "sup", "rec", "geo", "iba", "pre"} assert outer_keys == expected_outer_keys assert inner_keys == expected_inner_keys @@ -484,7 +493,6 @@ def test_classification_report_imbalanced_dict(): ([1, 1, 1, 1, 1, 2], [1, 2, 1, 2, 1, 2], 0.2), ([1, 1, 1, 2, 2, 2, 3, 3, 3], [1, 3, 1, 2, 1, 1, 2, 3, 3], 0.555), ([1, 1, 1, 1, 1, 1, 2, 3, 3], [1, 3, 1, 2, 1, 1, 2, 3, 3], 0.166), - ], ) def test_macro_averaged_mean_absolute_error(y_true, y_pred, expected_ma_mae): @@ -500,7 +508,9 @@ def test_macro_averaged_mean_absolute_error_sample_weight(): sample_weight = [1, 1, 1, 1, 1, 1] ma_mae_unit_weights = macro_averaged_mean_absolute_error( - y_true, 
y_pred, sample_weight=sample_weight, + y_true, + y_pred, + sample_weight=sample_weight, ) assert ma_mae_unit_weights == pytest.approx(ma_mae_no_weights) diff --git a/imblearn/metrics/tests/test_score_objects.py b/imblearn/metrics/tests/test_score_objects.py index c62458f66..88c7d2c93 100644 --- a/imblearn/metrics/tests/test_score_objects.py +++ b/imblearn/metrics/tests/test_score_objects.py @@ -41,7 +41,10 @@ def test_scorer_common_average(data, score, expected_score, average): scorer = make_scorer(score, pos_label=None, average=average) grid = GridSearchCV( - LinearSVC(random_state=0), param_grid={"C": [1, 10]}, scoring=scorer, cv=3, + LinearSVC(random_state=0), + param_grid={"C": [1, 10]}, + scoring=scorer, + cv=3, ) grid.fit(X_train, y_train).predict(X_test) @@ -55,7 +58,11 @@ def test_scorer_common_average(data, score, expected_score, average): (sensitivity_score, "binary", 0.92), (specificity_score, "binary", 0.95), (geometric_mean_score, "multiclass", 0.92), - (make_index_balanced_accuracy()(geometric_mean_score), "multiclass", 0.84,), + ( + make_index_balanced_accuracy()(geometric_mean_score), + "multiclass", + 0.84, + ), ], ) def test_scorer_default_average(data, score, average, expected_score): @@ -63,7 +70,10 @@ def test_scorer_default_average(data, score, average, expected_score): scorer = make_scorer(score, pos_label=1, average=average) grid = GridSearchCV( - LinearSVC(random_state=0), param_grid={"C": [1, 10]}, scoring=scorer, cv=3, + LinearSVC(random_state=0), + param_grid={"C": [1, 10]}, + scoring=scorer, + cv=3, ) grid.fit(X_train, y_train).predict(X_test) diff --git a/imblearn/over_sampling/_smote/cluster.py b/imblearn/over_sampling/_smote/cluster.py index 55e041776..b5074c5e2 100644 --- a/imblearn/over_sampling/_smote/cluster.py +++ b/imblearn/over_sampling/_smote/cluster.py @@ -200,9 +200,6 @@ def _fit_resample(self, X, y): if n_samples == 0: continue - # target_class_indices = np.flatnonzero(y == class_sample) - # X_class = _safe_indexing(X, target_class_indices) - X_clusters = self.kmeans_estimator_.fit_predict(X) valid_clusters = [] cluster_sparsities = [] diff --git a/imblearn/over_sampling/_smote/tests/test_kmeans_smote.py b/imblearn/over_sampling/_smote/tests/test_kmeans_smote.py index 504e4f28e..05d3e20b9 100644 --- a/imblearn/over_sampling/_smote/tests/test_kmeans_smote.py +++ b/imblearn/over_sampling/_smote/tests/test_kmeans_smote.py @@ -6,6 +6,7 @@ from sklearn.cluster import KMeans from sklearn.cluster import MiniBatchKMeans +from sklearn.datasets import make_classification from sklearn.neighbors import NearestNeighbors from imblearn.over_sampling import KMeansSMOTE @@ -87,24 +88,21 @@ def test_sample_kmeans_custom(data, k_neighbors, kmeans_estimator): assert kmeans_smote.kmeans_estimator_.n_clusters == 3 -def test_sample_kmeans_not_enough_clusters(): - rng = np.random.RandomState(42) - X = rng.randn(30, 2) - y = np.array([1] * 20 + [0] * 10) - - smote = KMeansSMOTE(random_state=42, kmeans_estimator=30, k_neighbors=2) +def test_sample_kmeans_not_enough_clusters(data): + X, y = data + smote = KMeansSMOTE(cluster_balance_threshold=10, random_state=42) with pytest.raises(RuntimeError): smote.fit_resample(X, y) -@pytest.mark.parametrize("density_exponent", ["auto", 2]) -@pytest.mark.parametrize("cluster_balance_threshold", ["auto", 0.8]) -def test_sample_kmeans_density_estimation( - data, density_exponent, cluster_balance_threshold -): - X, y = data +@pytest.mark.parametrize("density_exponent", ["auto", 10]) +@pytest.mark.parametrize("cluster_balance_threshold", 
["auto", 0.1]) +def test_sample_kmeans_density_estimation(density_exponent, cluster_balance_threshold): + X, y = make_classification( + n_samples=10_000, n_classes=2, weights=[0.3, 0.7], random_state=42 + ) smote = KMeansSMOTE( - random_state=42, + random_state=0, density_exponent=density_exponent, cluster_balance_threshold=cluster_balance_threshold, ) diff --git a/imblearn/over_sampling/tests/test_adasyn.py b/imblearn/over_sampling/tests/test_adasyn.py index 819682e2e..173547b7f 100644 --- a/imblearn/over_sampling/tests/test_adasyn.py +++ b/imblearn/over_sampling/tests/test_adasyn.py @@ -127,7 +127,10 @@ def test_ada_fit_resample_nn_obj(): @pytest.mark.parametrize( "adasyn_params, err_msg", [ - ({"sampling_strategy": {0: 9, 1: 12}}, "No samples will be generated.",), + ( + {"sampling_strategy": {0: 9, 1: 12}}, + "No samples will be generated.", + ), ({"n_neighbors": "rnd"}, "has to be one of"), ], ) diff --git a/imblearn/over_sampling/tests/test_random_over_sampler.py b/imblearn/over_sampling/tests/test_random_over_sampler.py index a30738d0a..04b9fc05d 100644 --- a/imblearn/over_sampling/tests/test_random_over_sampler.py +++ b/imblearn/over_sampling/tests/test_random_over_sampler.py @@ -244,8 +244,13 @@ def test_random_over_sampler_shrinkage_behaviour(data): ({}, "`shrinkage` should contain a shrinkage factor for each class"), (-1, "The shrinkage factor needs to be >= 0"), ({0: -1}, "The shrinkage factor needs to be >= 0"), - ([1, ], "`shrinkage` should either be a positive floating number or") - ] + ( + [ + 1, + ], + "`shrinkage` should either be a positive floating number or", + ), + ], ) def test_random_over_sampler_shrinkage_error(data, shrinkage, err_msg): # check the validation of the shrinkage parameter diff --git a/imblearn/tests/test_base.py b/imblearn/tests/test_base.py index e909958f0..47568f9cd 100644 --- a/imblearn/tests/test_base.py +++ b/imblearn/tests/test_base.py @@ -32,7 +32,8 @@ def test_function_sampler_reject_sparse(): X_sparse = sparse.csr_matrix(X) sampler = FunctionSampler(accept_sparse=False) with pytest.raises( - TypeError, match="A sparse matrix was passed, but dense data is required", + TypeError, + match="A sparse matrix was passed, but dense data is required", ): sampler.fit_resample(X_sparse, y) @@ -92,7 +93,7 @@ def dummy_sampler(X, y): pipeline = make_pipeline(sampler, LinearRegression()) y_pred = pipeline.fit(X, y).predict(X) - assert type_of_target(y_pred) == 'continuous' + assert type_of_target(y_pred) == "continuous" def test_function_resampler_fit(): diff --git a/imblearn/tests/test_common.py b/imblearn/tests/test_common.py index 732d07b62..6bbb944d5 100644 --- a/imblearn/tests/test_common.py +++ b/imblearn/tests/test_common.py @@ -17,7 +17,6 @@ from imblearn.utils.estimator_checks import parametrize_with_checks from imblearn.utils.estimator_checks import _set_checking_parameters -from imblearn.utils.estimator_checks import _yield_all_checks from imblearn.utils.testing import all_estimators from imblearn.under_sampling import NearMiss @@ -55,7 +54,12 @@ def test_estimators_compatibility_sklearn(estimator, check, request): def test_estimators_imblearn(estimator, check, request): # Common tests for estimator instances with ignore_warnings( - category=(FutureWarning, ConvergenceWarning, UserWarning, FutureWarning,) + category=( + FutureWarning, + ConvergenceWarning, + UserWarning, + FutureWarning, + ) ): _set_checking_parameters(estimator) check(estimator) diff --git a/imblearn/tests/test_pipeline.py b/imblearn/tests/test_pipeline.py index 
b1f721b03..a0a9b9a89 100644 --- a/imblearn/tests/test_pipeline.py +++ b/imblearn/tests/test_pipeline.py @@ -52,8 +52,7 @@ class NoFit: - """Small class to test parameter dispatching. - """ + """Small class to test parameter dispatching.""" def __init__(self, a=None, b=None): self.a = a @@ -114,8 +113,7 @@ def score(self, X, y=None): class FitParamT(BaseEstimator): - """Mock classifier - """ + """Mock classifier""" def __init__(self): self.successful = False @@ -170,8 +168,7 @@ def fit_resample(self, X, y): class FitTransformSample(NoTrans): - """Estimator implementing both transform and sample - """ + """Estimator implementing both transform and sample""" def fit(self, X, y, should_succeed=False): pass @@ -353,7 +350,10 @@ def test_pipeline_methods_preprocessing_svm(): scaler = StandardScaler() pca = PCA(n_components=2, svd_solver="randomized", whiten=True) clf = SVC( - gamma="scale", probability=True, random_state=0, decision_function_shape="ovr", + gamma="scale", + probability=True, + random_state=0, + decision_function_shape="ovr", ) for preprocessing in [scaler, pca]: @@ -682,7 +682,8 @@ def test_pipeline_memory_transformer(): assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) assert_array_equal( - pipe.named_steps["transf"].means_, cached_pipe.named_steps["transf"].means_, + pipe.named_steps["transf"].means_, + cached_pipe.named_steps["transf"].means_, ) assert not hasattr(transf, "means_") # Check that we are reading the cache while fitting @@ -694,7 +695,8 @@ def test_pipeline_memory_transformer(): assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) assert_array_equal( - pipe.named_steps["transf"].means_, cached_pipe.named_steps["transf"].means_, + pipe.named_steps["transf"].means_, + cached_pipe.named_steps["transf"].means_, ) assert cached_pipe.named_steps["transf"].timestamp_ == expected_ts # Create a new pipeline with cloned estimators @@ -755,7 +757,8 @@ def test_pipeline_memory_sampler(): assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) assert_array_equal( - pipe.named_steps["transf"].means_, cached_pipe.named_steps["transf"].means_, + pipe.named_steps["transf"].means_, + cached_pipe.named_steps["transf"].means_, ) assert not hasattr(transf, "means_") # Check that we are reading the cache while fitting @@ -767,7 +770,8 @@ def test_pipeline_memory_sampler(): assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) assert_array_equal( - pipe.named_steps["transf"].means_, cached_pipe.named_steps["transf"].means_, + pipe.named_steps["transf"].means_, + cached_pipe.named_steps["transf"].means_, ) assert cached_pipe.named_steps["transf"].timestamp_ == expected_ts # Create a new pipeline with cloned estimators @@ -1187,7 +1191,7 @@ def test_resampler_last_stage_passthrough(): pipe.fit_resample(X, y) -def test_pipeline_score_samples_pca_lof(): +def test_pipeline_score_samples_pca_lof_binary(): X, y = make_classification( n_classes=2, class_sep=2, @@ -1233,7 +1237,8 @@ def test_score_samples_on_pipeline_without_score_samples(): def test_pipeline_param_error(): clf = make_pipeline(LogisticRegression()) with pytest.raises( - ValueError, match="Pipeline.fit does not accept the sample_weight parameter", + ValueError, + 
match="Pipeline.fit does not accept the sample_weight parameter", ): clf.fit([[0], [0]], [0, 1], sample_weight=[1, 1]) @@ -1317,7 +1322,7 @@ def test_verbose(est, method, pattern, capsys): assert re.match(pattern, capsys.readouterr().out) -def test_pipeline_score_samples_pca_lof(): +def test_pipeline_score_samples_pca_lof_multiclass(): X, y = load_iris(return_X_y=True) sampling_strategy = {0: 50, 1: 30, 2: 20} X, y = make_imbalance(X, y, sampling_strategy=sampling_strategy) diff --git a/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py index 8148e2fdb..03d1970e5 100644 --- a/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py +++ b/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py @@ -70,7 +70,9 @@ def test_fit_resample_object(): sampling_strategy = "auto" cluster = KMeans(random_state=RND_SEED) cc = ClusterCentroids( - sampling_strategy=sampling_strategy, random_state=RND_SEED, estimator=cluster, + sampling_strategy=sampling_strategy, + random_state=RND_SEED, + estimator=cluster, ) X_resampled, y_resampled = cc.fit_resample(X, Y) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py index 44999ddb5..4c1856525 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py @@ -135,7 +135,10 @@ def test_enn_check_kind_selection(): `check_sel="mode"`.""" X, y = make_classification( - n_samples=1000, n_classes=2, weights=[0.3, 0.7], random_state=0, + n_samples=1000, + n_classes=2, + weights=[0.3, 0.7], + random_state=0, ) enn_all = EditedNearestNeighbours(kind_sel="all") diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py index 1274eb4bc..fcfe1f56b 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py @@ -41,7 +41,9 @@ def test_iht_init(): sampling_strategy = "auto" iht = InstanceHardnessThreshold( - estimator=ESTIMATOR, sampling_strategy=sampling_strategy, random_state=RND_SEED, + estimator=ESTIMATOR, + sampling_strategy=sampling_strategy, + random_state=RND_SEED, ) assert iht.sampling_strategy == sampling_strategy @@ -58,7 +60,9 @@ def test_iht_fit_resample(): def test_iht_fit_resample_half(): sampling_strategy = {0: 3, 1: 3} iht = InstanceHardnessThreshold( - estimator=NB(), sampling_strategy=sampling_strategy, random_state=RND_SEED, + estimator=NB(), + sampling_strategy=sampling_strategy, + random_state=RND_SEED, ) X_resampled, y_resampled = iht.fit_resample(X, Y) assert X_resampled.shape == (6, 2) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py b/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py index f2cab39c8..3e2e8686c 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py @@ -224,7 +224,9 @@ def test_nm_fit_resample_nn_obj(): ] for version_idx, version in enumerate(VERSION_NEARMISS): nm = NearMiss( - sampling_strategy=sampling_strategy, version=version, 
n_neighbors=nn, + sampling_strategy=sampling_strategy, + version=version, + n_neighbors=nn, ) X_resampled, y_resampled = nm.fit_resample(X, Y) assert_array_equal(X_resampled, X_gt[version_idx]) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py index 89015531a..a0d07266e 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py @@ -63,7 +63,9 @@ def test_rus_fit_resample(as_frame): def test_rus_fit_resample_half(): sampling_strategy = {0: 3, 1: 6} rus = RandomUnderSampler( - sampling_strategy=sampling_strategy, random_state=RND_SEED, replacement=True, + sampling_strategy=sampling_strategy, + random_state=RND_SEED, + replacement=True, ) X_resampled, y_resampled = rus.fit_resample(X, Y) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py index a206ec2b3..348a620cd 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_repeated_edited_nearest_neighbours.py @@ -337,7 +337,8 @@ def test_renn_not_good_object(): @pytest.mark.parametrize( - "max_iter, n_iter", [(2, 2), (5, 3)], + "max_iter, n_iter", + [(2, 2), (5, 3)], ) def test_renn_iter_attribute(max_iter, n_iter): renn = RepeatedEditedNearestNeighbours(max_iter=max_iter) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 5e47a2ed9..724f38be3 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -348,7 +348,7 @@ def check_samplers_multiclass_ova(name, sampler_orig): weights=[0.2, 0.3, 0.5], random_state=0, ) - y_ova = label_binarize(y, np.unique(y)) + y_ova = label_binarize(y, classes=np.unique(y)) X_res, y_res = sampler.fit_resample(X, y) X_res_ova, y_res_ova = sampler.fit_resample(X, y_ova) assert_allclose(X_res, X_res_ova) diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py index 30c4a932f..c356d4881 100644 --- a/imblearn/utils/tests/test_validation.py +++ b/imblearn/utils/tests/test_validation.py @@ -284,8 +284,16 @@ def sampling_strategy_func(y, multiplier): @pytest.mark.parametrize( "sampling_strategy, sampling_type, expected_result", [ - ({3: 25, 1: 25, 2: 25}, "under-sampling", OrderedDict({1: 25, 2: 25, 3: 25}),), - ({3: 100, 1: 100, 2: 100}, "over-sampling", OrderedDict({1: 50, 2: 0, 3: 75}),), + ( + {3: 25, 1: 25, 2: 25}, + "under-sampling", + OrderedDict({1: 25, 2: 25, 3: 25}), + ), + ( + {3: 100, 1: 100, 2: 100}, + "over-sampling", + OrderedDict({1: 50, 2: 0, 3: 75}), + ), ], ) def test_sampling_strategy_check_order(
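
Throughout this patch, `label_binarize(output[1], np.unique(y))` becomes
`label_binarize(output[1], classes=np.unique(y))` because scikit-learn 1.0
enforces keyword-only arguments for most utility parameters; positional use
was deprecated with a FutureWarning in 0.23 and now raises a TypeError. A
minimal standalone sketch of the surviving call style (not part of the patch
itself)::

    import numpy as np
    from sklearn.preprocessing import label_binarize

    y = np.array(["cat", "dog", "pig", "cat"])

    # Since scikit-learn 1.0, `classes` must be passed by keyword;
    # label_binarize(y, np.unique(y)) raises a TypeError.
    y_bin = label_binarize(y, classes=np.unique(y))
    print(y_bin)  # one column per class, one-hot encoded rows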
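The new `n_features_` property in `imblearn/ensemble/_forest.py` exists
because scikit-learn 1.0 deprecates the fitted `n_features_` attribute in
favour of `n_features_in_`, while the balanced forest still needs to expose
the old name for backward compatibility. The pattern, sketched on a toy
estimator (the class and attribute layout here are illustrative, not
imbalanced-learn API)::

    import numpy as np

    class CompatEstimator:
        """Toy estimator keeping a legacy `n_features_` alias alive."""

        def fit(self, X, y=None):
            X = np.asarray(X)
            self._n_features = X.shape[1]
            # Recent scikit-learn versions set this during validation.
            self.n_features_in_ = X.shape[1]
            return self

        @property
        def n_features_(self):
            # Prefer the modern attribute; fall back to the private copy
            # when running against scikit-learn versions that lack it.
            return getattr(self, "n_features_in_", self._n_features)

    est = CompatEstimator().fit(np.zeros((5, 3)))
    print(est.n_features_)  # 3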
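The docstring edits in `imblearn/metrics/_classification.py` shorten exact
float reprs such as `0.66666666666666663` to `0.66...`; examples written this
way only pass when doctests run with the ELLIPSIS option enabled, which is
how the scikit-learn style doctest setup runs them. A self-contained
illustration::

    import doctest

    def two_thirds():
        """Return 2 / 3.

        >>> two_thirds()
        0.66...
        """
        return 2 / 3

    # With ELLIPSIS, "..." matches the remaining digits, keeping the
    # examples stable across Python and NumPy repr changes.
    print(doctest.testmod(optionflags=doctest.ELLIPSIS))
    # TestResults(failed=0, attempted=1)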
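The `sklearn_0_24_conda` CI job and the `SKLEARN_VERSION` branch added to
`build_tools/azure/install.sh` keep the oldest supported scikit-learn
(0.24.2) under test alongside 1.0. When a test can only run on one side of
such a version range, a guard of this shape is the usual tool; this is a
sketch of the general pattern, not code taken from the patch::

    import pytest
    import sklearn
    from packaging.version import Version

    # Skip on installations older than the behaviour under test.
    requires_sklearn_1_0 = pytest.mark.skipif(
        Version(sklearn.__version__) < Version("1.0"),
        reason="requires scikit-learn >= 1.0",
    )

    @requires_sklearn_1_0
    def test_label_binarize_keyword_only():
        from sklearn.preprocessing import label_binarize

        assert label_binarize([0, 1], classes=[0, 1]).shape == (2, 1)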