MNT Compatibility with sklearn 1.0 (#864)

scikit-learn-contrib · Sep 29, 2021 · f407976 · f407976
1 parent edf6eae
commit f407976
Show file tree

Hide file tree

Showing 30 changed files with 252 additions and 133 deletions.
diff --git a/.pep8speaks.yml b/.pep8speaks.yml
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,19 +1,16 @@
 repos:
--   repo: https://github.com/python/black
-    rev: stable
-    hooks:
-    -   id: black
 -   repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v2.3.0
     hooks:
     -   id: check-yaml
     -   id: end-of-file-fixer
     -   id: trailing-whitespace
+-   repo: https://github.com/psf/black
+    rev: 21.6b0
+    hooks:
+    -   id: black
 -   repo: https://gitlab.com/pycqa/flake8
-    rev: 3.7.8
+    rev: 3.9.2
     hooks:
     -   id: flake8
         types: [file, python]
-        # only check for unused imports for now, as long as
-        # the code is not fully PEP8 compatible
-        args: [--select=F401]
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -3,23 +3,26 @@ jobs:
 - job: linting
   displayName: Linting
   pool:
-    vmImage: ubuntu-18.04
+    vmImage: ubuntu-20.04
   steps:
-    - bash: echo "##vso[task.prependpath]$CONDA/bin"
-      displayName: Add conda to PATH
-    - bash: sudo chown -R $USER $CONDA
-      displayName: Take ownership of conda installation
-    - bash: conda create --name flake8_env --yes flake8
-      displayName: Install flake8
+    - task: UsePythonVersion@0
+      inputs:
+        versionSpec: '3.9'
+    - bash: |
+        # Include pytest compatibility with mypy
+        pip install pytest flake8 mypy==0.782 black==21.6b0
+      displayName: Install linters
+    - bash: |
+        black --check .
+      displayName: Run black
     - bash: |
-        source activate flake8_env
         ./build_tools/circle/linting.sh
       displayName: Run linting
 
 - template: build_tools/azure/posix.yml
   parameters:
     name: Linux_Runs
-    vmImage: ubuntu-18.04
+    vmImage: ubuntu-20.04
     matrix:
       pylatest_pip_openblas_pandas:
         DISTRIB: 'conda-pip-latest'
@@ -33,15 +36,14 @@ jobs:
 - template: build_tools/azure/posix.yml
   parameters:
     name: Linux
-    vmImage: ubuntu-18.04
+    vmImage: ubuntu-20.04
     dependsOn: [linting]
     matrix:
       # Linux environment to test that scikit-learn can be built against
       # versions of numpy, scipy with ATLAS that comes with Ubuntu Bionic 18.04
       # i.e. numpy 1.13.3 and scipy 0.19
       py36_ubuntu_atlas:
         DISTRIB: 'ubuntu'
-        PYTHON_VERSION: '3.6'
         JOBLIB_VERSION: '*'
       pylatest_conda_pandas_keras:
         DISTRIB: 'conda'
@@ -61,11 +63,16 @@ jobs:
         TENSORFLOW_VERSION: '*'
         COVERAGE: 'true'
         TEST_DOCSTRINGS: 'true'
+      sklearn_0_24_conda:
+        DISTRIB: 'conda'
+        PYTHON_VERSION: '3.7'
+        SKLEARN_VERSION: '0.24.2'
+        INSTALL_MKL: 'true'
 
 - template: build_tools/azure/posix-32.yml
   parameters:
     name: Linux32
-    vmImage: ubuntu-18.04
+    vmImage: ubuntu-20.04
     dependsOn: [linting]
     matrix:
       py36_ubuntu_atlas_32bit:

diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh
@@ -32,10 +32,15 @@ if [[ "$DISTRIB" == "conda" ]]; then
     fi
 
     make_conda $TO_INSTALL
-    python -m pip install scikit-learn
 
     TO_INSTALL=""
 
+    if [[ -n "$SKLEARN_VERSION" ]]; then
+        TO_INSTALL="$TO_INSTALL scikit-learn=$SKLEARN_VERSION"
+    else
+        TO_INSTALL="$TO_INSTALL scikit-learn"
+    fi
+
     if [[ -n "$PANDAS_VERSION" ]]; then
         TO_INSTALL="$TO_INSTALL pandas=$PANDAS_VERSION"
     fi

diff --git a/doc/install.rst b/doc/install.rst
@@ -12,7 +12,7 @@ The imbalanced-learn package requires the following dependencies:
 * python (>=3.6)
 * numpy (>=1.13.3)
 * scipy (>=0.19.1)
-* scikit-learn (>=0.23)
+* scikit-learn (>=0.24)
 * keras 2 (optional)
 * tensorflow (optional)
 

diff --git a/doc/sphinxext/sphinx_issues.py b/doc/sphinxext/sphinx_issues.py
@@ -80,7 +80,11 @@ class IssueRole(object):
     EXTERNAL_REPO_REGEX = re.compile(r"^(\w+)/(.+)([#@])([\w]+)$")
 
     def __init__(
-        self, uri_config_option, format_kwarg, github_uri_template, format_text=None,
+        self,
+        uri_config_option,
+        format_kwarg,
+        github_uri_template,
+        format_text=None,
     ):
         self.uri_config_option = uri_config_option
         self.format_kwarg = format_kwarg
@@ -103,7 +107,9 @@ def make_node(self, name, issue_no, config, options=None):
                 )
             path = name_map.get(name)
             ref = "https://github.com/{issues_github_path}/{path}/{n}".format(
-                issues_github_path="{}/{}".format(username, repo), path=path, n=issue,
+                issues_github_path="{}/{}".format(username, repo),
+                path=path,
+                n=issue,
             )
             formatted_issue = self.format_text(issue).lstrip("#")
             text = "{username}/{repo}{symbol}{formatted_issue}".format(**locals())

diff --git a/doc/whats_new/v0.8.rst b/doc/whats_new/v0.8.rst
@@ -1,5 +1,18 @@
 .. _changes_0_8:
 
+Version 0.8.1
+=============
+
+**In development**
+
+Changelog
+
+Maintenance
+...........
+
+- Make `imbalanced-learn` compatible with `scikit-learn` 1.0.
+  :pr:`864` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 Version 0.8.0
 =============
 

diff --git a/imblearn/base.py b/imblearn/base.py
@@ -82,7 +82,9 @@ def fit_resample(self, X, y):
 
         output = self._fit_resample(X, y)
 
-        y_ = label_binarize(output[1], np.unique(y)) if binarize_y else output[1]
+        y_ = (
+            label_binarize(output[1], classes=np.unique(y)) if binarize_y else output[1]
+        )
 
         X_, y_ = arrays_transformer.transform(output[0], y_)
         return (X_, y_) if len(output) == 2 else (X_, y_, output[2])
@@ -284,7 +286,11 @@ def fit_resample(self, X, y):
 
         if self.validate:
 
-            y_ = label_binarize(output[1], np.unique(y)) if binarize_y else output[1]
+            y_ = (
+                label_binarize(output[1], classes=np.unique(y))
+                if binarize_y
+                else output[1]
+            )
             X_, y_ = arrays_transformer.transform(output[0], y_)
             return (X_, y_) if len(output) == 2 else (X_, y_, output[2])
 

diff --git a/imblearn/ensemble/_forest.py b/imblearn/ensemble/_forest.py
@@ -422,15 +422,13 @@ def fit(self, X, y, sample_weight=None):
         )
         if sample_weight is not None:
             sample_weight = _check_sample_weight(sample_weight, X)
+        self._n_features = X.shape[1]
 
         if issparse(X):
             # Pre-sort indices to avoid that each individual tree of the
             # ensemble sorts the indices.
             X.sort_indices()
 
-        # Remap output
-        _, self.n_features_ = X.shape
-
         y = np.atleast_1d(y)
         if y.ndim == 2 and y.shape[1] == 1:
             warn(
@@ -627,5 +625,13 @@ def _set_oob_score(self, X, y):
 
         self.oob_score_ = oob_score / self.n_outputs_
 
+    @property
+    def n_features_(self):
+        """Number of features seen in fit; prefers ``n_features_in_`` when set."""
+        return getattr(self, "n_features_in_", self._n_features)
+
     def _more_tags(self):
-        return {"multioutput": False}
+        return {
+            "multioutput": False,
+            "multilabel": False,
+        }
diff --git a/imblearn/ensemble/tests/test_easy_ensemble.py b/imblearn/ensemble/tests/test_easy_ensemble.py
@@ -48,7 +48,10 @@
 def test_easy_ensemble_classifier(n_estimators, base_estimator):
     # Check classification for various parameter settings.
     X, y = make_imbalance(
-        iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0,
+        iris.data,
+        iris.target,
+        sampling_strategy={0: 20, 1: 25, 2: 50},
+        random_state=0,
     )
     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
 
@@ -72,7 +75,10 @@ def test_easy_ensemble_classifier(n_estimators, base_estimator):
 def test_base_estimator():
     # Check base_estimator and its default values.
     X, y = make_imbalance(
-        iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0,
+        iris.data,
+        iris.target,
+        sampling_strategy={0: 20, 1: 25, 2: 50},
+        random_state=0,
     )
     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
 
@@ -91,7 +97,10 @@ def test_base_estimator():
 
 def test_bagging_with_pipeline():
     X, y = make_imbalance(
-        iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0,
+        iris.data,
+        iris.target,
+        sampling_strategy={0: 20, 1: 25, 2: 50},
+        random_state=0,
     )
     estimator = EasyEnsembleClassifier(
         n_estimators=2,
@@ -109,7 +118,9 @@ def test_warm_start(random_state=42):
     for n_estimators in [5, 10]:
         if clf_ws is None:
             clf_ws = EasyEnsembleClassifier(
-                n_estimators=n_estimators, random_state=random_state, warm_start=True,
+                n_estimators=n_estimators,
+                random_state=random_state,
+                warm_start=True,
             )
         else:
             clf_ws.set_params(n_estimators=n_estimators)
@@ -182,7 +193,10 @@ def test_warm_start_equivalence():
 )
 def test_easy_ensemble_classifier_error(n_estimators, msg_error):
     X, y = make_imbalance(
-        iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0,
+        iris.data,
+        iris.target,
+        sampling_strategy={0: 20, 1: 25, 2: 50},
+        random_state=0,
     )
     with pytest.raises(ValueError, match=msg_error):
         eec = EasyEnsembleClassifier(n_estimators=n_estimators)
@@ -191,7 +205,10 @@ def test_easy_ensemble_classifier_error(n_estimators, msg_error):
 
 def test_easy_ensemble_classifier_single_estimator():
     X, y = make_imbalance(
-        iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0,
+        iris.data,
+        iris.target,
+        sampling_strategy={0: 20, 1: 25, 2: 50},
+        random_state=0,
     )
     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
 
@@ -205,14 +222,19 @@ def test_easy_ensemble_classifier_single_estimator():
 
 def test_easy_ensemble_classifier_grid_search():
     X, y = make_imbalance(
-        iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0,
+        iris.data,
+        iris.target,
+        sampling_strategy={0: 20, 1: 25, 2: 50},
+        random_state=0,
     )
 
     parameters = {
         "n_estimators": [1, 2],
         "base_estimator__n_estimators": [3, 4],
     }
     grid_search = GridSearchCV(
-        EasyEnsembleClassifier(base_estimator=AdaBoostClassifier()), parameters, cv=5,
+        EasyEnsembleClassifier(base_estimator=AdaBoostClassifier()),
+        parameters,
+        cv=5,
     )
     grid_search.fit(X, y)
diff --git a/imblearn/ensemble/tests/test_forest.py b/imblearn/ensemble/tests/test_forest.py
@@ -32,7 +32,10 @@ def imbalanced_dataset():
     [
         ({"n_estimators": "whatever"}, "n_estimators must be an integer"),
         ({"n_estimators": -100}, "n_estimators must be greater than zero"),
-        ({"bootstrap": False, "oob_score": True}, "Out of bag estimation only",),
+        (
+            {"bootstrap": False, "oob_score": True},
+            "Out of bag estimation only",
+        ),
     ],
 )
 def test_balanced_random_forest_error(imbalanced_dataset, forest_params, err_msg):
@@ -105,7 +108,10 @@ def test_balanced_random_forest_oob(imbalanced_dataset):
         X, y, random_state=42, stratify=y
     )
     est = BalancedRandomForestClassifier(
-        oob_score=True, random_state=0, n_estimators=1000, min_samples_leaf=2,
+        oob_score=True,
+        random_state=0,
+        n_estimators=1000,
+        min_samples_leaf=2,
     )
 
     est.fit(X_train, y_train)
@@ -135,12 +141,16 @@ def test_little_tree_with_small_max_samples():
 
     # First fit with no restriction on max samples
     est1 = BalancedRandomForestClassifier(
-        n_estimators=1, random_state=rng, max_samples=None,
+        n_estimators=1,
+        random_state=rng,
+        max_samples=None,
     )
 
     # Second fit with max samples restricted to just 2
     est2 = BalancedRandomForestClassifier(
-        n_estimators=1, random_state=rng, max_samples=2,
+        n_estimators=1,
+        random_state=rng,
+        max_samples=2,
     )
 
     est1.fit(X, y)