From 0b188101e0a3cd9a5f9815b707c4c74ac4aa59d4 Mon Sep 17 00:00:00 2001 From: Aivar Sootla Date: Fri, 15 Mar 2024 16:20:47 +0000 Subject: [PATCH 01/12] Update objective computation: multiply quadratic biases with (1-alpha) --- dwave/plugins/sklearn/transformers.py | 30 ++++++++++++------- .../algorithm-update-d73cd4f7c854a8b3.yaml | 3 ++ 2 files changed, 22 insertions(+), 11 deletions(-) create mode 100644 releasenotes/notes/algorithm-update-d73cd4f7c854a8b3.yaml diff --git a/dwave/plugins/sklearn/transformers.py b/dwave/plugins/sklearn/transformers.py index aa186be..f0fe455 100644 --- a/dwave/plugins/sklearn/transformers.py +++ b/dwave/plugins/sklearn/transformers.py @@ -185,7 +185,8 @@ def correlation_cqm( label=f"{num_features}-hot", ) - with tempfile.TemporaryFile() as fX, tempfile.TemporaryFile() as fout: + with tempfile.TemporaryFile() as fX, tempfile.TemporaryFile() as fout,\ + tempfile.TemporaryFile() as targetcor_out: # we make a copy of X because we'll be modifying it in-place within # some of the functions X_copy = np.memmap(fX, X.dtype, mode="w+", shape=(X.shape[0], X.shape[1] + 1)) @@ -199,23 +200,30 @@ def correlation_cqm( mode="w+", shape=(X_copy.shape[1], X_copy.shape[1]), ) - + # make the matrix that will hold the output correlations + target_correlations = np.memmap( + targetcor_out, + dtype=np.result_type(X, y), + mode="w+", + shape=(X_copy.shape[1], 1), + ) # main calculation. It modifies X_copy in-place corrcoef(X_copy, out=correlations, rowvar=False, copy=False) - # we don't care about the direction of correlation in terms of # the penalty/quality np.absolute(correlations, out=correlations) - + # copying the correlations with the target column + target_correlations = correlations[:, -1].copy() + # multiplying all terms with (1 - alpha) + np.multiply(correlations, (1 - alpha), out=correlations) # our objective - # we multiply by 2 because the matrix is symmetric - np.fill_diagonal(correlations, correlations[:, -1] * (-2 * alpha * num_features)) - - # Note: the full symmetric matrix (with both upper- and lower-diagonal - # entries for each correlation coefficient) is retained for consistency with - # the original formulation from Milne et al. + # we multiply by num_features to have consistent performance + # with the increase of the number of features + np.fill_diagonal(correlations, target_correlations * (- alpha * num_features)) + # Note: we only add terms on and above the diagonal it = np.nditer(correlations[:-1, :-1], flags=['multi_index'], op_flags=[['readonly']]) - cqm.set_objective((*it.multi_index, x) for x in it if x) + cqm.set_objective((*it.multi_index, x) for x in it + if it.multi_index[0] <= it.multi_index[1] and x) return cqm diff --git a/releasenotes/notes/algorithm-update-d73cd4f7c854a8b3.yaml b/releasenotes/notes/algorithm-update-d73cd4f7c854a8b3.yaml new file mode 100644 index 0000000..dc85f5f --- /dev/null +++ b/releasenotes/notes/algorithm-update-d73cd4f7c854a8b3.yaml @@ -0,0 +1,3 @@ +--- +fixes: + - Multiply off-diagonal terms of the correlation matrix with 1-alpha. 
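A minimal standalone sketch of the selection model this patch produces, for reference only (it uses plain numpy.corrcoef instead of the memory-mapped corrcoef above, and toy_correlation_cqm is a hypothetical helper, not plugin API): with binary indicators x_i, the objective is (1 - alpha) * sum_{i<j} |corr(X_i, X_j)| x_i x_j - alpha * num_features * sum_i |corr(X_i, y)| x_i, subject to the num_features-hot constraint.

import dimod
import numpy as np

def toy_correlation_cqm(X, y, num_features, alpha=0.5):
    # Absolute correlations of [X | y]; the last column holds correlations with the target.
    corr = np.abs(np.corrcoef(np.column_stack([X, y]), rowvar=False))
    n = X.shape[1]
    obj = dimod.BinaryQuadraticModel(vartype=dimod.BINARY)
    for i in range(n):
        # Linear bias: feature quality (correlation with the target), weighted by alpha * num_features.
        obj.add_linear(i, -alpha * num_features * corr[i, -1])
        for j in range(i + 1, n):
            # Quadratic bias: redundancy between features i and j, weighted by (1 - alpha).
            obj.add_quadratic(i, j, (1 - alpha) * corr[i, j])
    cqm = dimod.ConstrainedQuadraticModel()
    cqm.set_objective(obj)
    cqm.add_constraint(((v, 1) for v in cqm.variables), '==', num_features,
                       label=f"{num_features}-hot")
    return cqm

With this weighting, alpha=1 zeroes the quadratic (redundancy) terms and alpha=0 zeroes the linear (quality) terms, which is what the tests added in the following patch check.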
\ No newline at end of file From 6981c61d8a59fb655d27fc67063bd7beb8194823 Mon Sep 17 00:00:00 2001 From: Aivar Sootla Date: Wed, 20 Mar 2024 10:01:22 +0000 Subject: [PATCH 02/12] address comments: replace a tempfile with direct computations, add a test --- dwave/plugins/sklearn/transformers.py | 20 +++++--------------- tests/test_transformer.py | 4 ++++ 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/dwave/plugins/sklearn/transformers.py b/dwave/plugins/sklearn/transformers.py index f0fe455..ada1ad3 100644 --- a/dwave/plugins/sklearn/transformers.py +++ b/dwave/plugins/sklearn/transformers.py @@ -185,8 +185,7 @@ def correlation_cqm( label=f"{num_features}-hot", ) - with tempfile.TemporaryFile() as fX, tempfile.TemporaryFile() as fout,\ - tempfile.TemporaryFile() as targetcor_out: + with tempfile.TemporaryFile() as fX, tempfile.TemporaryFile() as fout: # we make a copy of X because we'll be modifying it in-place within # some of the functions X_copy = np.memmap(fX, X.dtype, mode="w+", shape=(X.shape[0], X.shape[1] + 1)) @@ -199,27 +198,18 @@ def correlation_cqm( dtype=np.result_type(X, y), mode="w+", shape=(X_copy.shape[1], X_copy.shape[1]), - ) - # make the matrix that will hold the output correlations - target_correlations = np.memmap( - targetcor_out, - dtype=np.result_type(X, y), - mode="w+", - shape=(X_copy.shape[1], 1), - ) + ) # main calculation. It modifies X_copy in-place corrcoef(X_copy, out=correlations, rowvar=False, copy=False) # we don't care about the direction of correlation in terms of # the penalty/quality np.absolute(correlations, out=correlations) - # copying the correlations with the target column - target_correlations = correlations[:, -1].copy() - # multiplying all terms with (1 - alpha) - np.multiply(correlations, (1 - alpha), out=correlations) + # multiplying all but last columns and rows with (1 - alpha) + np.multiply(correlations[:-1, :-1], (1 - alpha), out=correlations[:-1, :-1]) # our objective # we multiply by num_features to have consistent performance # with the increase of the number of features - np.fill_diagonal(correlations, target_correlations * (- alpha * num_features)) + np.fill_diagonal(correlations, correlations[:, -1] * (- alpha * num_features)) # Note: we only add terms on and above the diagonal it = np.nditer(correlations[:-1, :-1], flags=['multi_index'], op_flags=[['readonly']]) cqm.set_objective((*it.multi_index, x) for x in it diff --git a/tests/test_transformer.py b/tests/test_transformer.py index 14214cc..6e0ef3e 100644 --- a/tests/test_transformer.py +++ b/tests/test_transformer.py @@ -126,6 +126,10 @@ def test_alpha_0(self): cqm = SelectFromQuadraticModel.correlation_cqm(self.X, self.y, num_features=3, alpha=0) self.assertTrue(not any(cqm.objective.linear.values())) + def test_alpha_1_no_quadratic(self): + cqm = SelectFromQuadraticModel.correlation_cqm(self.X, self.y, num_features=3, alpha=1) + self.assertTrue(not any(cqm.objective.quadratic.values())) + def test_alpha_1(self): rng = np.random.default_rng(42) From 25634b69e1d893c161a8661a632e0402cd5d80d0 Mon Sep 17 00:00:00 2001 From: Aivar Sootla Date: Tue, 2 Apr 2024 14:43:15 +0000 Subject: [PATCH 03/12] [WIP] adding feature selection using (conditional) mutual information --- dwave/plugins/sklearn/transformers.py | 154 ++++++++-- dwave/plugins/sklearn/utilities.py | 274 ++++++++++++++++++ .../notes/mutual-info-ddd1a23e320f88ae.yaml | 4 + tests/test_utilities.py | 104 ++++++- 4 files changed, 504 insertions(+), 32 deletions(-) create mode 100644 
releasenotes/notes/mutual-info-ddd1a23e320f88ae.yaml diff --git a/dwave/plugins/sklearn/transformers.py b/dwave/plugins/sklearn/transformers.py index ada1ad3..9773e99 100644 --- a/dwave/plugins/sklearn/transformers.py +++ b/dwave/plugins/sklearn/transformers.py @@ -30,7 +30,7 @@ from sklearn.feature_selection import SelectorMixin from sklearn.utils.validation import check_is_fitted -from dwave.plugins.sklearn.utilities import corrcoef +from dwave.plugins.sklearn.utilities import corrcoef, estimate_mi_matrix __all__ = ["SelectFromQuadraticModel"] @@ -57,7 +57,7 @@ class SelectFromQuadraticModel(SelectorMixin, BaseEstimator): ACCEPTED_METHODS = [ "correlation", - # "mutual information", # todo + "mutual_information", ] def __init__( @@ -153,27 +153,6 @@ def correlation_cqm( https://1qbit.com/whitepaper/optimal-feature-selection-in-credit-scoring-classification-using-quantum-annealer """ - X = np.atleast_2d(np.asarray(X)) - y = np.asarray(y) - - if X.ndim != 2: - raise ValueError("X must be a 2-dimensional array-like") - - if y.ndim != 1: - raise ValueError("y must be a 1-dimensional array-like") - - if y.shape[0] != X.shape[0]: - raise ValueError(f"requires: X.shape[0] == y.shape[0] but {X.shape[0]} != {y.shape[0]}") - - if not 0 <= alpha <= 1: - raise ValueError(f"alpha must be between 0 and 1, given {alpha}") - - if num_features <= 0: - raise ValueError(f"num_features must be a positive integer, given {num_features}") - - if X.shape[0] <= 1: - raise ValueError("X must have at least two rows") - cqm = dimod.ConstrainedQuadraticModel() cqm.add_variables(dimod.BINARY, X.shape[1]) @@ -212,11 +191,113 @@ def correlation_cqm( np.fill_diagonal(correlations, correlations[:, -1] * (- alpha * num_features)) # Note: we only add terms on and above the diagonal it = np.nditer(correlations[:-1, :-1], flags=['multi_index'], op_flags=[['readonly']]) - cqm.set_objective((*it.multi_index, x) for x in it - if it.multi_index[0] <= it.multi_index[1] and x) + print(correlations) + cqm.set_objective((*it.multi_index, x) for x in it if x) return cqm + @staticmethod + def mutual_information_cqm( + X: npt.ArrayLike, + y: npt.ArrayLike, + *, + num_features: int, + alpha: float = 0.5, + strict: bool = True, + conditional: bool = True, + discrete_features: typing.Union[str, npt.ArrayLike] = "auto", + discrete_target: bool = False, + copy: bool = True, + n_neighbors: int = 4, + n_workers: int = 1, + random_state: typing.Union[None, int] = None, + ) -> dimod.ConstrainedQuadraticModel: + """Build a constrained quadratic model for feature selection. + + If ``conditional`` is True then the conditional mutual information + criterion from [2] is used, and if ``conditional`` is False then + mutual information based criterion from [1] is used. + + For computation of mutual information and conditional mutual information + + + Args: + X: + Feature vectors formatted as a numerical 2D array-like. + y: + Class labels formatted as a numerical 1D array-like. + alpha: + Hyperparameter between 0 and 1 that controls the relative weight of + the relevance and redundancy terms. + ``alpha=0`` places no weight on the quality of the features, + therefore the features will be selected as to minimize the + redundancy without any consideration to quality. + ``alpha=1`` places the maximum weight on the quality of the features, + and therefore will be equivalent to using + :class:`sklearn.feature_selection.SelectKBest`. + num_features: + The number of features to select. 
+ strict: + If ``False`` the constraint on the number of selected features + is ``<=`` rather than ``==``. + conditional: bool, default=True + Whether to compute the off-diagonal terms using the conditional mutual + information or joint mutual information + discrete_features: + See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + discrete_target: + See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + n_neighbors: + See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + copy: + See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + random_state: + See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + n_workers: int, default=1 + Number of workers for parallel computation on the cpu + + Returns: + A constrained quadratic model. + + References: + .. [1] Peng, F. Long, and C. Ding. Feature selection based on mutual information criteria of max-dependency, + max-relevance, and min-redundancy. IEEE Transactions on pattern analysis and machine intelligence, + 27(8):1226–1238, 2005. + .. [2] X. V. Nguyen, J. Chan, S. Romano, and J. Bailey. Effective global approaches for mutual information + based feature selection. In Proceedings of the 20th ACM SIGKDD international conference on + Knowledge discovery and data mining, pages 512–521. ACM, 2014. + """ + + cqm = dimod.ConstrainedQuadraticModel() + cqm.add_variables(dimod.BINARY, X.shape[1]) + + # add the k-hot constraint + cqm.add_constraint( + ((v, 1) for v in cqm.variables), + '==' if strict else '<=', + num_features, + label=f"{num_features}-hot", + ) + + mi = estimate_mi_matrix( + X, y, discrete_features, discrete_target, + n_neighbors=n_neighbors, copy=copy, + random_state=random_state, n_workers=n_workers, + conditional=conditional) + + if not conditional: + # mutliplying all features with num_features + np.multiply(mi, num_features, out=mi) + # mutpliypling off-diagonal ones with -(1-alpha) + np.multiply(mi, -(1 - alpha), out=mi) + # mutpliypling off-diagonal ones with alpha + diagonal = alpha * np.diag(mi) + np.fill_diagonal(mi, diagonal) + + it = np.nditer(mi, flags=['multi_index'], op_flags=[['readonly']]) + cqm.set_objective((*it.multi_index, x) for x in it if x) + return cqm + def fit( self, X: npt.ArrayLike, @@ -225,6 +306,7 @@ def fit( alpha: typing.Optional[float] = None, num_features: typing.Optional[int] = None, time_limit: typing.Optional[float] = None, + **kwargs ) -> SelectFromQuadraticModel: """Select the features to keep. @@ -253,18 +335,30 @@ def fit( This instance of `SelectFromQuadraticModel`. 
""" X = np.atleast_2d(np.asarray(X)) + y = np.asarray(y) + if X.ndim != 2: raise ValueError("X must be a 2-dimensional array-like") + if y.ndim != 1: + raise ValueError("y must be a 1-dimensional array-like") - # y is checked by the correlation method function + if y.shape[0] != X.shape[0]: + raise ValueError(f"requires: X.shape[0] == y.shape[0] but {X.shape[0]} != {y.shape[0]}") + + if X.shape[0] <= 1: + raise ValueError("X must have at least two rows") if alpha is None: alpha = self.alpha - # alpha is checked by the correlation method function if num_features is None: num_features = self.num_features - # num_features is checked by the correlation method function + + if not 0 <= alpha <= 1: + raise ValueError(f"alpha must be between 0 and 1, given {alpha}") + + if num_features <= 0: + raise ValueError(f"num_features must be a positive integer, given {num_features}") # time_limit is checked by the LeapHybridCQMSampelr @@ -275,8 +369,8 @@ def fit( if self.method == "correlation": cqm = self.correlation_cqm(X, y, num_features=num_features, alpha=alpha) - # elif self.method == "mutual information": - # cqm = self.mutual_information_cqm(X, y, num_features=num_features) + elif self.method == "mutual_information": + cqm = self.mutual_information_cqm(X, y, num_features=num_features, alpha=alpha, **kwargs) else: raise ValueError(f"only methods {self.acceptable_methods} are implemented") diff --git a/dwave/plugins/sklearn/utilities.py b/dwave/plugins/sklearn/utilities.py index caccfcd..9670651 100644 --- a/dwave/plugins/sklearn/utilities.py +++ b/dwave/plugins/sklearn/utilities.py @@ -47,10 +47,20 @@ # See the License for the specific language governing permissions and # limitations under the License. +from itertools import combinations import typing +from joblib import Parallel, delayed, cpu_count import numpy as np import numpy.typing as npt +from scipy.sparse import issparse +from scipy.special import digamma +from scipy.stats import entropy +from sklearn.feature_selection._mutual_info import _compute_mi, _iterate_columns +from sklearn.neighbors import NearestNeighbors, KDTree +from sklearn.preprocessing import scale +from sklearn.utils import check_random_state +from sklearn.utils.validation import check_array, check_X_y __all__ = ["corrcoef", "cov", "dot_2d"] @@ -224,3 +234,267 @@ def dot_2d(a: npt.ArrayLike, b: npt.ArrayLike, *, out.flush() return out + + +def estimate_mi_matrix( + X: npt.ArrayLike, + y: npt.ArrayLike, + discrete_features: typing.Union[str, npt.ArrayLike]="auto", + discrete_target: bool = False, + n_neighbors: int = 4, + conditional: bool = True, + copy: bool = True, + random_state: typing.Union[None, int] = None, + n_workers: int = 1, + ) -> npt.ArrayLike: + """ + For the feature array `X` and the target array `y` computes + the matrix of (conditional) mutual information interactions. + + cmi_{i, j} = I(x_i; y) + + If `conditional = True`, then the off-diagonal terms are computed: + + cmi_{i, j} = (I(x_i; y| x_j) + I(x_j; y| x_i)) / 2 + + Otherwise + + cmi_{i, j} = I(x_i; x_j) + + Computation of I(x; y) uses the scikit-learn implementation, i.e., + :func:`sklearn.feature_selection._mutual_info._estimate_mi`. 
The computation + of I(x; y| z) is based on + https://github.com/jannisteunissen/mutual_information + + Args: + X: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + + y: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + + conditional: bool, default=True + Whether to compute the off-diagonal terms using the conditional mutual + information or joint mutual information + + discrete_features: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + + discrete_target: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + + n_neighbors: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + + copy: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + + random_state: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + + n_workers: int, default=1 + Number of workers for parallel computation on the cpu + + Returns: + mi_matrix : ndarray, shape (n_features, n_features) + Interaction matrix between the features using (conditional) mutual information. + A negative value will be replaced by 0. + + References: + .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual + information". Phys. Rev. E 69, 2004. + .. [2] B. C. Ross "Mutual Information between Discrete and Continuous + Data Sets". PLoS ONE 9(2), 2014. + .. [3] Mesner, Octavio César, and Cosma Rohilla Shalizi. "Conditional + mutual information estimation for mixed, discrete and continuous + data." IEEE Transactions on Information Theory 67.1 (2020): 464-484. + """ + + X, y = check_X_y(X, y, accept_sparse="csc", y_numeric=not discrete_target) + n_samples, n_features = X.shape + + if isinstance(discrete_features, (str, bool)): + if isinstance(discrete_features, str): + if discrete_features == "auto": + discrete_features = issparse(X) + else: + raise ValueError("Invalid string value for discrete_features.") + discrete_mask = np.empty(n_features, dtype=bool) + discrete_mask.fill(discrete_features) + else: + discrete_features = check_array(discrete_features, ensure_2d=False) + if discrete_features.dtype != "bool": + discrete_mask = np.zeros(n_features, dtype=bool) + discrete_mask[discrete_features] = True + else: + discrete_mask = discrete_features + + continuous_mask = ~discrete_mask + if np.any(continuous_mask) and issparse(X): + raise ValueError("Sparse matrix `X` can't have continuous features.") + + rng = check_random_state(random_state) + if np.any(continuous_mask): + if copy: + X = X.copy() + + X[:, continuous_mask] = scale( + X[:, continuous_mask], with_mean=False, copy=False + ) + + # Add small noise to continuous features as advised in Kraskov et. al. + X = X.astype(np.float64, copy=False) + means = np.maximum(1, np.mean(np.abs(X[:, continuous_mask]), axis=0)) + X[:, continuous_mask] += ( + 1e-10 + * means + * rng.standard_normal(size=(n_samples, np.sum(continuous_mask))) + ) + + if not discrete_target: + y = scale(y, with_mean=False) + y += ( + 1e-10 + * np.maximum(1, np.mean(np.abs(y))) + * rng.standard_normal(size=n_samples) + ) + + n_features = X.shape[1] + max_n_workers = cpu_count()-1 + if n_workers > max_n_workers: + n_workers = max_n_workers + Warning(f"Specified number of workers {n_workers} is larger than the number of cpus." 
+ f"Will use only {max_n_workers}.") + + mi_matrix = np.zeros((n_features, n_features), dtype=np.float64) + + off_diagonal_vals = Parallel(n_jobs=n_workers)( + delayed(_compute_off_diagonal_mi) + (xi, xj, y, discrete_feature_i, discrete_feature_j, discrete_target, n_neighbors, conditional) + for (xi, discrete_feature_i), (xj, discrete_feature_j) in combinations(zip(_iterate_columns(X), discrete_mask), 2) + ) + # keeping the discrete masks since the functions support it + diagonal_vals = Parallel(n_jobs=n_workers)( + delayed(_compute_mi) + (xi, y, discrete_feature_i, discrete_target, n_neighbors) + for xi, discrete_feature_i in zip(_iterate_columns(X), discrete_mask) + ) + np.fill_diagonal(mi_matrix, diagonal_vals) + off_diagonal_val = iter(off_diagonal_vals) + if conditional: + # Although we need to compute `(I(X_j; Y| X_i) + I(X_i; Y| X_j))/2` + # we can avoid the computation of the second term using the formula: + # `I(X_j; Y| X_i) = I(X_i; Y| X_j) + I(X_j; Y) - I(X_i; Y)` + for (i, d_i), (j, d_j) in combinations(enumerate(diagonal_vals), 2): + val = next(off_diagonal_val) + val += max(val + (d_j - d_i), 0) + mi_matrix[i, j] = mi_matrix[j, i] = val/2 + else: + for i, j in combinations(range(n_features), 2): + mi_matrix[i, j] = mi_matrix[j, i] = next(off_diagonal_val) + return mi_matrix + + +def _compute_off_diagonal_mi( + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike, + discrete_feature_i: bool, + discrete_feature_j: bool, + discrete_target: bool, + n_neighbors: int = 4, + conditional: bool = True, + ): + """ + Computing a distance `d` between features `x_i` and `x_j`. + If `conditional = True`, then conditional mutual infomation is used + `I(x_i; y | x_j)`. + + If `conditonal = False` then mutual information is used + `I(x_i; x_j)`. + + References + ---------- + .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual + information". Phys. Rev. E 69, 2004. + .. [2] B. C. Ross "Mutual Information between Discrete and Continuous + Data Sets". PLoS ONE 9(2), 2014. + .. [3] Mesner, Octavio César, and Cosma Rohilla Shalizi. "Conditional + mutual information estimation for mixed, discrete and continuous + data." IEEE Transactions on Information Theory 67.1 (2020): 464-484. + """ + if conditional: + if discrete_feature_i and discrete_feature_j and discrete_target: + return _compute_cmi_d(xi, xj, y) + else: + # TODO: consider adding treatement of mixed discrete + # and continuous variables + return _compute_cmi_c(xi, xj, y, n_neighbors) + else: + return _compute_mi(xi, xj, discrete_feature_i, discrete_feature_j, n_neighbors) + + +def _compute_cmi_d( + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike): + """ + Computes conditional mutual infomation `I(x_i; y | x_j)`. + All random variables `x_i`, `x_j` and `y` are discrete. + + Adpated from https://github.com/dwave-examples/mutual-information-feature-selection + """ + + # Computing joint probability distribution for the features and the target + unique_labels = [np.hstack((np.unique(col), np.inf)) for col in (xi, xj, y)] + prob, _ = np.histogramdd(np.vstack((xi, xj, y)).T, bins=unique_labels) + + cmi_ij = ( + entropy(prob.sum(axis=2).flatten()) # H(x_i, x_j) + +entropy(prob.sum(axis=0).flatten()) # H(x_j, y) + -entropy(prob.sum(axis=(0,2))) # H(x_j) + -entropy(prob.flatten()) # H(x_i, x_j, y) + ) + return cmi_ij + + +def _compute_cmi_c( + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike, + n_neighbors: int = 4): + """ + Computes conditional mutual infomation `(I(x_i; y | x_j)`. 
+ At least one random variables from `x_i`, `x_j` and `y` is continuous. + + Adapted from https://github.com/jannisteunissen/mutual_information + """ + xi = xi.reshape((-1, 1)) + xj = xj.reshape((-1, 1)) + y = y.reshape((-1, 1)) + + # Here we rely on NearestNeighbors to select the fastest algorithm. + nn = NearestNeighbors(metric="chebyshev", n_neighbors=n_neighbors) + nn.fit(np.hstack((xi, xj, y))) + radius = nn.kneighbors()[0] + radius = np.nextafter(radius[:, -1], 0) + + # KDTree is explicitly fit to allow for the querying of number of + # neighbors within a specified radius + n_j = _num_points_within_radius(xj, radius) + n_ij = _num_points_within_radius(np.hstack((xi, xj)), radius) + n_jy = _num_points_within_radius(np.hstack((y, xj)), radius) + + cmi_ij = ( + digamma(n_neighbors) + +np.mean(digamma(n_j)) + -np.mean(digamma(n_ij)) + -np.mean(digamma(n_jy))) + return max(0, cmi_ij) + + +def _num_points_within_radius(x: npt.ArrayLike, radius: npt.ArrayLike): + """For each point, determine the number of other points within a given radius + Function from https://github.com/jannisteunissen/mutual_information + + :param x: ndarray, shape (n_samples, n_dim) + :param radius: radius, shape (n_samples,) + :returns: number of points within radius + + """ + kd = KDTree(x, metric="chebyshev") + nx = kd.query_radius(x, radius, count_only=True, return_distance=False) + return np.array(nx) diff --git a/releasenotes/notes/mutual-info-ddd1a23e320f88ae.yaml b/releasenotes/notes/mutual-info-ddd1a23e320f88ae.yaml new file mode 100644 index 0000000..c3ed08c --- /dev/null +++ b/releasenotes/notes/mutual-info-ddd1a23e320f88ae.yaml @@ -0,0 +1,4 @@ +--- +features: + - | + Add feature selection method using (conditional) mutual information diff --git a/tests/test_utilities.py b/tests/test_utilities.py index 107f4d8..df2a35e 100644 --- a/tests/test_utilities.py +++ b/tests/test_utilities.py @@ -51,8 +51,10 @@ import numpy as np -from dwave.plugins.sklearn.utilities import corrcoef, cov, dot_2d - +from dwave.plugins.sklearn.utilities import corrcoef, cov, dot_2d, _compute_cmi_c, _compute_cmi_d +from scipy.special import digamma +from sklearn.feature_selection._mutual_info import _compute_mi +from sklearn.neighbors import KDTree class TestCorrCoef(unittest.TestCase): def test_agreement(self): @@ -178,3 +180,101 @@ def test_memmap(self): out = np.memmap(fout, "float64", mode="w+", shape=(X.shape[0], X.shape[0])) dot_2d(X, X.T, out=out) + + +class TestMI(unittest.TestCase): + def test_cmi(self): + """ + We test the algorithm using the formula `I(x;y|z) = I(x;y) + I(z;y) - I(z;y|x)`. + Since the mutual information data is an sklearn function, + :func:`sklearn.feature_selection._mutual_info._compute_mi` + it is highly likely that formula is valid only if our algorithm is correct. 
+ For continous variables the formula may still break for some seeds + """ + np.random.seed(42) + n_samples, n_neighbors = 4003, 4 + # Test continuous implementation + Xy = np.random.randn(n_samples, 3) + cmi_ij = _compute_cmi_c(Xy[:, 0], Xy[:, 1], Xy[:, 2], n_neighbors=n_neighbors) + cmi_ji = _compute_cmi_c(Xy[:, 1], Xy[:, 0], Xy[:, 2], n_neighbors=n_neighbors) + mi_i = _compute_mi(Xy[:, 0], Xy[:, 2], False, False, n_neighbors=n_neighbors) + mi_j = _compute_mi(Xy[:, 1], Xy[:, 2], False, False, n_neighbors=n_neighbors) + self.assertAlmostEqual( + max(cmi_ij + mi_j - mi_i, 0), cmi_ji, places=3, + msg="The formula for continuous conditional mutual information is violated") + + # Test discrete implementation + n_samples, n_ss = 103, 51 + xi = np.random.randint(0, 11, (n_samples, )) + xj = np.hstack(( + np.random.randint(-2, 1, (n_ss, )), + np.random.randint(10, 14, (n_samples-n_ss, )))) + np.random.shuffle(xi) + np.random.shuffle(xj) + y = np.random.randint(-12, -10, (n_samples, )) + cmi_ij = _compute_cmi_d(xi, xj, y) + cmi_ji = _compute_cmi_d(xj, xi, y) + mi_i = _compute_mi(xi, y, True, True, n_neighbors=n_neighbors) + mi_j = _compute_mi(xj, y, True, True, n_neighbors=n_neighbors) + self.assertAlmostEqual( + cmi_ij + mi_j - mi_i, cmi_ji, places=5, + msg="The formula for discrete conditional mutual information is violated") + + def test_implementation(self): + # methods from https://github.com/jannisteunissen/mutual_information + def _compute_cmi_t(x, z, y, n_neighbors): + n_samples = len(x) + + x = x.reshape(-1, 1) + y = y.reshape(-1, 1) + z = z.reshape(-1, 1) + xyz = np.hstack((x, y, z)) + k = np.full(n_samples, n_neighbors) + radius = get_radius_kneighbors(xyz, n_neighbors) + + nxz = num_points_within_radius(np.hstack((x, z)), radius) + nyz = num_points_within_radius(np.hstack((y, z)), radius) + nz = num_points_within_radius(z, radius) + + cmi = max(0, np.mean(digamma(k)) - np.mean(digamma(nxz + 1)) + - np.mean(digamma(nyz + 1)) + np.mean(digamma(nz + 1))) + return cmi + + def get_radius_kneighbors(x, n_neighbors): + """Determine smallest radius around x containing n_neighbors neighbors + + :param x: ndarray, shape (n_samples, n_dim) + :param n_neighbors: number of neighbors + :returns: radius, shape (n_samples,) + """ + # Use KDTree for simplicity (sometimes a ball tree could be faster) + kd = KDTree(x, metric="chebyshev") + + # Results include point itself, therefore n_neighbors+1 + neigh_dist = kd.query(x, k=n_neighbors+1)[0] + + # Take radius slightly larger than distance to last neighbor + radius = np.nextafter(neigh_dist[:, -1], 0) + return radius + + def num_points_within_radius(x, radius): + """For each point, determine the number of other points within a given radius + + :param x: ndarray, shape (n_samples, n_dim) + :param radius: radius, shape (n_samples,) + :returns: number of points within radius + """ + kd = KDTree(x, metric="chebyshev") + nx = kd.query_radius(x, radius, count_only=True, return_distance=False) + return np.array(nx) - 1.0 + + n_samples, n_neighbors = 103, 4 + # Test continuous implementation + Xy = np.random.randn(n_samples, 3) + cmi_t = _compute_cmi_t(Xy[:,0], Xy[:,1], Xy[:,2], n_neighbors=n_neighbors) + + cmi_c = _compute_cmi_c(Xy[:,0], Xy[:,1], Xy[:,2], n_neighbors=n_neighbors) + + self.assertAlmostEqual( + cmi_t, cmi_c, places=5, + msg="The algorithm doesn't match the original implementation") From 31ae832155019ef64c28a23914017f9b943332ae Mon Sep 17 00:00:00 2001 From: Aivar Sootla Date: Mon, 8 Apr 2024 15:23:43 +0100 Subject: [PATCH 04/12] add computations 
for mixed distributions, respond to PR comments --- dwave/plugins/sklearn/transformers.py | 218 ++++++++++++-- dwave/plugins/sklearn/utilities.py | 409 +++++++++++++++----------- requirements.txt | 3 + setup.cfg | 5 +- tests/test_transformer.py | 8 +- tests/test_utilities.py | 94 ++++-- 6 files changed, 505 insertions(+), 232 deletions(-) diff --git a/dwave/plugins/sklearn/transformers.py b/dwave/plugins/sklearn/transformers.py index 9773e99..88dc149 100644 --- a/dwave/plugins/sklearn/transformers.py +++ b/dwave/plugins/sklearn/transformers.py @@ -14,23 +14,25 @@ from __future__ import annotations -import itertools -import logging +from itertools import combinations import tempfile import typing -import warnings import dimod -import numpy as np -import numpy.typing as npt - +from dwave.plugins.sklearn.utilities import corrcoef, _compute_off_diagonal_mi, tqdm_joblib from dwave.cloud.exceptions import ConfigFileError, SolverAuthenticationError from dwave.system import LeapHybridCQMSampler +from joblib import Parallel, delayed, cpu_count +import numpy as np +import numpy.typing as npt +from scipy.sparse import issparse from sklearn.base import BaseEstimator from sklearn.feature_selection import SelectorMixin -from sklearn.utils.validation import check_is_fitted - -from dwave.plugins.sklearn.utilities import corrcoef, estimate_mi_matrix +from sklearn.feature_selection._mutual_info import _compute_mi, _iterate_columns +from sklearn.preprocessing import scale +from sklearn.utils import check_random_state +from sklearn.utils.validation import check_is_fitted, check_array, check_X_y +from tqdm import tqdm __all__ = ["SelectFromQuadraticModel"] @@ -110,8 +112,8 @@ def _get_support_mask(self) -> np.ndarray[typing.Any, np.dtype[np.bool_]]: except AttributeError: raise RuntimeError("fit hasn't been run yet") - @staticmethod def correlation_cqm( + self, X: npt.ArrayLike, y: npt.ArrayLike, *, @@ -152,6 +154,7 @@ def correlation_cqm( 1QBit; White Paper. https://1qbit.com/whitepaper/optimal-feature-selection-in-credit-scoring-classification-using-quantum-annealer """ + self._check_params(X, y, alpha, num_features) cqm = dimod.ConstrainedQuadraticModel() cqm.add_variables(dimod.BINARY, X.shape[1]) @@ -191,13 +194,12 @@ def correlation_cqm( np.fill_diagonal(correlations, correlations[:, -1] * (- alpha * num_features)) # Note: we only add terms on and above the diagonal it = np.nditer(correlations[:-1, :-1], flags=['multi_index'], op_flags=[['readonly']]) - print(correlations) cqm.set_objective((*it.multi_index, x) for x in it if x) return cqm - @staticmethod - def mutual_information_cqm( + def mutual_information_cqm( + self, X: npt.ArrayLike, y: npt.ArrayLike, *, @@ -268,6 +270,8 @@ def mutual_information_cqm( Knowledge discovery and data mining, pages 512–521. ACM, 2014. """ + self._check_params(X, y, alpha, num_features) + cqm = dimod.ConstrainedQuadraticModel() cqm.add_variables(dimod.BINARY, X.shape[1]) @@ -335,30 +339,18 @@ def fit( This instance of `SelectFromQuadraticModel`. 
""" X = np.atleast_2d(np.asarray(X)) - y = np.asarray(y) - if X.ndim != 2: raise ValueError("X must be a 2-dimensional array-like") - if y.ndim != 1: - raise ValueError("y must be a 1-dimensional array-like") - if y.shape[0] != X.shape[0]: - raise ValueError(f"requires: X.shape[0] == y.shape[0] but {X.shape[0]} != {y.shape[0]}") - - if X.shape[0] <= 1: - raise ValueError("X must have at least two rows") + # y is checked by the correlation method function if alpha is None: alpha = self.alpha + # alpha is checked by the correlation method function if num_features is None: num_features = self.num_features - - if not 0 <= alpha <= 1: - raise ValueError(f"alpha must be between 0 and 1, given {alpha}") - - if num_features <= 0: - raise ValueError(f"num_features must be a positive integer, given {num_features}") + # num_features is checked by the correlation method function # time_limit is checked by the LeapHybridCQMSampelr @@ -403,3 +395,173 @@ def fit( def unfit(self): """Undo a previously executed ``fit`` method.""" del self._mask + + @staticmethod + def _check_params(X: npt.ArrayLike, + y: npt.ArrayLike, + alpha: float, + num_features: int): + X = np.atleast_2d(np.asarray(X)) + y = np.asarray(y) + + if X.ndim != 2: + raise ValueError("X must be a 2-dimensional array-like") + + if y.ndim != 1: + raise ValueError("y must be a 1-dimensional array-like") + + if y.shape[0] != X.shape[0]: + raise ValueError(f"requires: X.shape[0] == y.shape[0] but {X.shape[0]} != {y.shape[0]}") + + if not 0 <= alpha <= 1: + raise ValueError(f"alpha must be between 0 and 1, given {alpha}") + + if num_features <= 0: + raise ValueError(f"num_features must be a positive integer, given {num_features}") + + if X.shape[0] <= 1: + raise ValueError("X must have at least two rows") + + +def estimate_mi_matrix( + X: npt.ArrayLike, + y: npt.ArrayLike, + discrete_features: typing.Union[str, npt.ArrayLike]="auto", + discrete_target: bool = False, + n_neighbors: int = 4, + conditional: bool = True, + copy: bool = True, + random_state: typing.Union[None, int] = None, + n_workers: int = 1, + n_subsample: int = -1 + ) -> npt.ArrayLike: + """ + For the feature array `X` and the target array `y` computes + the matrix of (conditional) mutual information interactions. + + cmi_{i, j} = I(x_i; y) + + If `conditional = True`, then the off-diagonal terms are computed: + + cmi_{i, j} = (I(x_i; y| x_j) + I(x_j; y| x_i)) / 2 + + Otherwise + + cmi_{i, j} = I(x_i; x_j) + + Computation of I(x; y) uses the scikit-learn implementation, i.e., + :func:`sklearn.feature_selection._mutual_info._estimate_mi`. 
The computation + of I(x; y| z) is based on + https://github.com/jannisteunissen/mutual_information + + Args: + X: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + + y: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + + conditional: bool, default=True + Whether to compute the off-diagonal terms using the conditional mutual + information or joint mutual information + + discrete_features: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + + discrete_target: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + + n_neighbors: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + + copy: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + + random_state: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + + n_workers: int, default=1 + Number of workers for parallel computation on the cpu + + Returns: + mi_matrix : ndarray, shape (n_features, n_features) + Interaction matrix between the features using (conditional) mutual information. + A negative value will be replaced by 0. + + References: + .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual + information". Phys. Rev. E 69, 2004. + .. [2] B. C. Ross "Mutual Information between Discrete and Continuous + Data Sets". PLoS ONE 9(2), 2014. + .. [3] Mesner, Octavio César, and Cosma Rohilla Shalizi. "Conditional + mutual information estimation for mixed, discrete and continuous + data." IEEE Transactions on Information Theory 67.1 (2020): 464-484. + """ + + X, y = check_X_y(X, y, accept_sparse="csc", y_numeric=not discrete_target) + n_samples, n_features = X.shape + + if isinstance(discrete_features, (str, bool)): + if isinstance(discrete_features, str): + if discrete_features == "auto": + discrete_features = issparse(X) + else: + raise ValueError("Invalid string value for discrete_features.") + discrete_mask = np.empty(n_features, dtype=bool) + discrete_mask.fill(discrete_features) + else: + discrete_features = check_array(discrete_features, ensure_2d=False) + if discrete_features.dtype != "bool": + discrete_mask = np.zeros(n_features, dtype=bool) + discrete_mask[discrete_features] = True + else: + discrete_mask = discrete_features + + continuous_mask = ~discrete_mask + if np.any(continuous_mask) and issparse(X): + raise ValueError("Sparse matrix `X` can't have continuous features.") + + rng = check_random_state(random_state) + if np.any(continuous_mask): + if copy: + X = X.copy() + + X[:, continuous_mask] = scale( + X[:, continuous_mask], with_mean=False, copy=False + ) + + # Add small noise to continuous features as advised in Kraskov et. al. + X = X.astype(np.float64, copy=False) + means = np.maximum(1, np.mean(np.abs(X[:, continuous_mask]), axis=0)) + X[:, continuous_mask] += ( + 1e-10 + * means + * rng.standard_normal(size=(n_samples, np.sum(continuous_mask))) + ) + + if not discrete_target: + y = scale(y, with_mean=False) + y += ( + 1e-10 + * np.maximum(1, np.mean(np.abs(y))) + * rng.standard_normal(size=n_samples) + ) + + n_features = X.shape[1] + max_n_workers = cpu_count()-1 + if n_workers > max_n_workers: + n_workers = max_n_workers + Warning(f"Specified number of workers {n_workers} is larger than the number of cpus." 
+ f"Will use only {max_n_workers}.") + + mi_matrix = np.zeros((n_features, n_features), dtype=np.float64) + with tqdm_joblib(tqdm(desc="Computing off-diagonal terms", total=len(discrete_mask) * (len(discrete_mask) - 1) // 2 )) as progress_bar: + off_diagonal_vals = Parallel(n_jobs=n_workers)( + delayed(_compute_off_diagonal_mi) + (xi, xj, y, discrete_feature_i, discrete_feature_j, discrete_target, n_neighbors, conditional, n_subsample) + for (xi, discrete_feature_i), (xj, discrete_feature_j) in combinations(zip(_iterate_columns(X), discrete_mask), 2) + ) + diagonal_vals = Parallel(n_jobs=n_workers)( + delayed(_compute_mi) + (xi, y, discrete_feature_i, discrete_target, n_neighbors) + for xi, discrete_feature_i in zip(_iterate_columns(X), discrete_mask) + ) + np.fill_diagonal(mi_matrix, diagonal_vals) + off_diagonal_val = iter(off_diagonal_vals) + for i, j in combinations(range(n_features), 2): + mi_matrix[i, j] = mi_matrix[j, i] = next(off_diagonal_val) + return mi_matrix + diff --git a/dwave/plugins/sklearn/utilities.py b/dwave/plugins/sklearn/utilities.py index 9670651..5910b53 100644 --- a/dwave/plugins/sklearn/utilities.py +++ b/dwave/plugins/sklearn/utilities.py @@ -47,20 +47,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -from itertools import combinations import typing -from joblib import Parallel, delayed, cpu_count +import contextlib +import joblib import numpy as np import numpy.typing as npt -from scipy.sparse import issparse from scipy.special import digamma from scipy.stats import entropy -from sklearn.feature_selection._mutual_info import _compute_mi, _iterate_columns +from sklearn.feature_selection._mutual_info import _compute_mi from sklearn.neighbors import NearestNeighbors, KDTree -from sklearn.preprocessing import scale -from sklearn.utils import check_random_state -from sklearn.utils.validation import check_array, check_X_y __all__ = ["corrcoef", "cov", "dot_2d"] @@ -235,158 +231,6 @@ def dot_2d(a: npt.ArrayLike, b: npt.ArrayLike, *, return out - -def estimate_mi_matrix( - X: npt.ArrayLike, - y: npt.ArrayLike, - discrete_features: typing.Union[str, npt.ArrayLike]="auto", - discrete_target: bool = False, - n_neighbors: int = 4, - conditional: bool = True, - copy: bool = True, - random_state: typing.Union[None, int] = None, - n_workers: int = 1, - ) -> npt.ArrayLike: - """ - For the feature array `X` and the target array `y` computes - the matrix of (conditional) mutual information interactions. - - cmi_{i, j} = I(x_i; y) - - If `conditional = True`, then the off-diagonal terms are computed: - - cmi_{i, j} = (I(x_i; y| x_j) + I(x_j; y| x_i)) / 2 - - Otherwise - - cmi_{i, j} = I(x_i; x_j) - - Computation of I(x; y) uses the scikit-learn implementation, i.e., - :func:`sklearn.feature_selection._mutual_info._estimate_mi`. 
The computation - of I(x; y| z) is based on - https://github.com/jannisteunissen/mutual_information - - Args: - X: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` - - y: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` - - conditional: bool, default=True - Whether to compute the off-diagonal terms using the conditional mutual - information or joint mutual information - - discrete_features: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` - - discrete_target: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` - - n_neighbors: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` - - copy: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` - - random_state: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` - - n_workers: int, default=1 - Number of workers for parallel computation on the cpu - - Returns: - mi_matrix : ndarray, shape (n_features, n_features) - Interaction matrix between the features using (conditional) mutual information. - A negative value will be replaced by 0. - - References: - .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual - information". Phys. Rev. E 69, 2004. - .. [2] B. C. Ross "Mutual Information between Discrete and Continuous - Data Sets". PLoS ONE 9(2), 2014. - .. [3] Mesner, Octavio César, and Cosma Rohilla Shalizi. "Conditional - mutual information estimation for mixed, discrete and continuous - data." IEEE Transactions on Information Theory 67.1 (2020): 464-484. - """ - - X, y = check_X_y(X, y, accept_sparse="csc", y_numeric=not discrete_target) - n_samples, n_features = X.shape - - if isinstance(discrete_features, (str, bool)): - if isinstance(discrete_features, str): - if discrete_features == "auto": - discrete_features = issparse(X) - else: - raise ValueError("Invalid string value for discrete_features.") - discrete_mask = np.empty(n_features, dtype=bool) - discrete_mask.fill(discrete_features) - else: - discrete_features = check_array(discrete_features, ensure_2d=False) - if discrete_features.dtype != "bool": - discrete_mask = np.zeros(n_features, dtype=bool) - discrete_mask[discrete_features] = True - else: - discrete_mask = discrete_features - - continuous_mask = ~discrete_mask - if np.any(continuous_mask) and issparse(X): - raise ValueError("Sparse matrix `X` can't have continuous features.") - - rng = check_random_state(random_state) - if np.any(continuous_mask): - if copy: - X = X.copy() - - X[:, continuous_mask] = scale( - X[:, continuous_mask], with_mean=False, copy=False - ) - - # Add small noise to continuous features as advised in Kraskov et. al. - X = X.astype(np.float64, copy=False) - means = np.maximum(1, np.mean(np.abs(X[:, continuous_mask]), axis=0)) - X[:, continuous_mask] += ( - 1e-10 - * means - * rng.standard_normal(size=(n_samples, np.sum(continuous_mask))) - ) - - if not discrete_target: - y = scale(y, with_mean=False) - y += ( - 1e-10 - * np.maximum(1, np.mean(np.abs(y))) - * rng.standard_normal(size=n_samples) - ) - - n_features = X.shape[1] - max_n_workers = cpu_count()-1 - if n_workers > max_n_workers: - n_workers = max_n_workers - Warning(f"Specified number of workers {n_workers} is larger than the number of cpus." 
- f"Will use only {max_n_workers}.") - - mi_matrix = np.zeros((n_features, n_features), dtype=np.float64) - - off_diagonal_vals = Parallel(n_jobs=n_workers)( - delayed(_compute_off_diagonal_mi) - (xi, xj, y, discrete_feature_i, discrete_feature_j, discrete_target, n_neighbors, conditional) - for (xi, discrete_feature_i), (xj, discrete_feature_j) in combinations(zip(_iterate_columns(X), discrete_mask), 2) - ) - # keeping the discrete masks since the functions support it - diagonal_vals = Parallel(n_jobs=n_workers)( - delayed(_compute_mi) - (xi, y, discrete_feature_i, discrete_target, n_neighbors) - for xi, discrete_feature_i in zip(_iterate_columns(X), discrete_mask) - ) - np.fill_diagonal(mi_matrix, diagonal_vals) - off_diagonal_val = iter(off_diagonal_vals) - if conditional: - # Although we need to compute `(I(X_j; Y| X_i) + I(X_i; Y| X_j))/2` - # we can avoid the computation of the second term using the formula: - # `I(X_j; Y| X_i) = I(X_i; Y| X_j) + I(X_j; Y) - I(X_i; Y)` - for (i, d_i), (j, d_j) in combinations(enumerate(diagonal_vals), 2): - val = next(off_diagonal_val) - val += max(val + (d_j - d_i), 0) - mi_matrix[i, j] = mi_matrix[j, i] = val/2 - else: - for i, j in combinations(range(n_features), 2): - mi_matrix[i, j] = mi_matrix[j, i] = next(off_diagonal_val) - return mi_matrix - def _compute_off_diagonal_mi( xi: npt.ArrayLike, @@ -397,6 +241,7 @@ def _compute_off_diagonal_mi( discrete_target: bool, n_neighbors: int = 4, conditional: bool = True, + n_subsample: int = -1 ): """ Computing a distance `d` between features `x_i` and `x_j`. @@ -417,48 +262,66 @@ def _compute_off_diagonal_mi( data." IEEE Transactions on Information Theory 67.1 (2020): 464-484. """ if conditional: + if n_subsample > 0: + idx = np.arange(xi.shape[0]) + np.random.shuffle(idx) + idx = idx[:n_subsample] + xi, xj, y = xi[idx], xj[idx], y[idx] if discrete_feature_i and discrete_feature_j and discrete_target: - return _compute_cmi_d(xi, xj, y) + val = _compute_cmip_d(xi, xj, y) + elif n_subsample >= 0 and discrete_target: + val = _compute_cmip_ccd(xi, xj, y, n_neighbors) + elif n_subsample >= 0 and discrete_feature_i and (not discrete_target): + val = _compute_cmip_cdc(xj, xi, y, n_neighbors) + elif n_subsample >= 0 and (not discrete_feature_i) and discrete_feature_j and (not discrete_target): + val = _compute_cmip_cdc(xi, xj, y, n_neighbors) else: - # TODO: consider adding treatement of mixed discrete - # and continuous variables - return _compute_cmi_c(xi, xj, y, n_neighbors) + val = _compute_cmip_c(xi, xj, y, n_neighbors) + return sum(val) / 2 else: return _compute_mi(xi, xj, discrete_feature_i, discrete_feature_j, n_neighbors) -def _compute_cmi_d( +def _compute_cmip_d( xi: npt.ArrayLike, xj: npt.ArrayLike, - y: npt.ArrayLike): + y: npt.ArrayLike) -> typing.Tuple[float]: """ - Computes conditional mutual infomation `I(x_i; y | x_j)`. + Computes conditional mutual infomation pair `I(x_i; y | x_j)` `I(x_j; y | x_i)`. All random variables `x_i`, `x_j` and `y` are discrete. 
Adpated from https://github.com/dwave-examples/mutual-information-feature-selection """ # Computing joint probability distribution for the features and the target - unique_labels = [np.hstack((np.unique(col), np.inf)) for col in (xi, xj, y)] - prob, _ = np.histogramdd(np.vstack((xi, xj, y)).T, bins=unique_labels) - + bin_boundaries = [np.hstack((np.unique(col), np.inf)) for col in (xi, xj, y)] + prob, _ = np.histogramdd(np.vstack((xi, xj, y)).T, bins=bin_boundaries) + Hijy = entropy(prob.flatten()) + Hij = entropy(prob.sum(axis=2).flatten()) cmi_ij = ( - entropy(prob.sum(axis=2).flatten()) # H(x_i, x_j) + Hij # H(x_i, x_j) +entropy(prob.sum(axis=0).flatten()) # H(x_j, y) -entropy(prob.sum(axis=(0,2))) # H(x_j) - -entropy(prob.flatten()) # H(x_i, x_j, y) + -Hijy # H(x_i, x_j, y) + ) + cmi_ji = ( + Hij # H(x_i, x_j) + +entropy(prob.sum(axis=1).flatten()) # H(x_i, y) + -entropy(prob.sum(axis=(1,2))) # H(x_i) + -Hijy # H(x_i, x_j, y) ) - return cmi_ij + return cmi_ij, cmi_ji - -def _compute_cmi_c( + +def _compute_cmip_c( xi: npt.ArrayLike, xj: npt.ArrayLike, y: npt.ArrayLike, - n_neighbors: int = 4): + n_neighbors: int = 4) -> typing.Tuple[float]: """ - Computes conditional mutual infomation `(I(x_i; y | x_j)`. - At least one random variables from `x_i`, `x_j` and `y` is continuous. + Computes conditional mutual information pair `(I(x_i; y | x_j)` + and `(I(x_j; y | x_i)`. All random variables `x_i`, `x_j` and `y` + are assumed to be continuous. Adapted from https://github.com/jannisteunissen/mutual_information """ @@ -478,12 +341,163 @@ def _compute_cmi_c( n_ij = _num_points_within_radius(np.hstack((xi, xj)), radius) n_jy = _num_points_within_radius(np.hstack((y, xj)), radius) + n_i = _num_points_within_radius(xi, radius) + n_iy = _num_points_within_radius(np.hstack((y, xi)), radius) + cmi_ij = ( digamma(n_neighbors) +np.mean(digamma(n_j)) -np.mean(digamma(n_ij)) -np.mean(digamma(n_jy))) - return max(0, cmi_ij) + + cmi_ji = ( + digamma(n_neighbors) + +np.mean(digamma(n_i)) + -np.mean(digamma(n_ij)) + -np.mean(digamma(n_iy))) + return max(0, cmi_ij), max(0, cmi_ji) + + +def _compute_cmip_ccd( + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike, + n_neighbors: int = 4) -> typing.Tuple[float]: + """ + Computes conditional mutual information pair `(I(x_i; y | x_j)` + and `(I(x_j; y | x_i)`. Random variables `x_i`, `x_j` are assumed + to be continuous, while `y` is assumed to be discrete. + + Adapted from https://github.com/jannisteunissen/mutual_information + """ + xi = xi.reshape((-1, 1)) + xj = xj.reshape((-1, 1)) + + (radius, + label_counts, + k_all) = _get_radius_k_neighbours_d( + xi, xj, y, + n_neighbors=n_neighbors) + + # Ignore points with unique labels. 
+ mask = label_counts > 1 + label_counts = label_counts[mask] + k_all = k_all[mask] + xi = xi[mask] + xj = xj[mask] + y = y[mask] + radius = radius[mask] + # KDTree is explicitly fit to allow for the querying of number of + # neighbors within a specified radius + + # continuous + n_i = _num_points_within_radius(xi, radius) + n_j = _num_points_within_radius(xj, radius) + n_ij = _num_points_within_radius(np.hstack((xi, xj)), radius) + + # mixed estimates + n_iy = _num_points_within_radius_cd(xi, y, radius) + n_jy = _num_points_within_radius_cd(xj, y, radius) + + common_terms = ( + np.mean(digamma(k_all)) + -np.mean(digamma(n_ij))) + cmi_ij = ( + common_terms + +np.mean(digamma(n_j)) + -np.mean(digamma(n_jy))) + cmi_ji = ( + common_terms + +np.mean(digamma(n_i)) + -np.mean(digamma(n_iy))) + return max(0, cmi_ij), max(0, cmi_ji) + + +def _compute_cmip_cdc( + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike, + n_neighbors: int = 4) -> typing.Tuple[float]: + """ + Computes conditional mutual information pair `(I(x_i; y | x_j)` + and `(I(x_j; y | x_i)`. Random variables `x_i`, `y` are assumed + to be continuous, while `x_j` is assumed to be discrete. + + Adapted from https://github.com/jannisteunissen/mutual_information + """ + xi = xi.reshape((-1, 1)) + y = y.reshape((-1, 1)) + + (radius, + label_counts, + k_all) = _get_radius_k_neighbours_d( + xi, y, xj, + n_neighbors=n_neighbors) + + # Ignore points with unique labels. + mask = label_counts > 1 + label_counts = label_counts[mask] + k_all = k_all[mask] + xi = xi[mask] + xj = xj[mask] + y = y[mask] + radius = radius[mask] + + # KDTree is explicitly fit to allow for the querying of number of + # neighbors within a specified radius + + # continuous + n_i = _num_points_within_radius(xi, radius) + n_iy = _num_points_within_radius(np.hstack((xi, y)), radius) + + # mixed estimates + n_ij = _num_points_within_radius_cd(xi, xj, radius) + n_jy = _num_points_within_radius_cd(y, xj, radius) + + # discrete estimates + n_j = label_counts + + common_terms = ( + np.mean(digamma(k_all)) + -np.mean(digamma(n_ij))) + cmi_ij = ( + common_terms + +np.mean(digamma(n_j)) + -np.mean(digamma(n_jy))) + cmi_ji = ( + common_terms + +np.mean(digamma(n_i)) + -np.mean(digamma(n_iy))) + return max(0, cmi_ij), max(0, cmi_ji) + + +def _get_radius_k_neighbours_d( + c1: npt.ArrayLike, + c2: npt.ArrayLike, + d: npt.ArrayLike, + n_neighbors: int = 4 + ) -> typing.Tuple[npt.ArrayLike, npt.ArrayLike, npt.ArrayLike]: + + c1 = c1.reshape((-1, 1)) + c2 = c2.reshape((-1, 1)) + n_samples = c1.shape[0] + radius = np.empty(n_samples) + label_counts = np.empty(n_samples) + k_all = np.empty(n_samples) + nn = NearestNeighbors(metric="chebyshev") + + for label in np.unique(d): + mask = d == label + count = np.sum(mask) + if count > 1: + k = min(n_neighbors, count - 1) + nn.set_params(n_neighbors=k) + nn.fit(np.hstack((c1[mask], c2[mask]))) + r = nn.kneighbors()[0] + radius[mask] = np.nextafter(r[:, -1], 0) + k_all[mask] = k + label_counts[mask] = np.sum(mask) + return radius, label_counts, k_all def _num_points_within_radius(x: npt.ArrayLike, radius: npt.ArrayLike): @@ -498,3 +512,42 @@ def _num_points_within_radius(x: npt.ArrayLike, radius: npt.ArrayLike): kd = KDTree(x, metric="chebyshev") nx = kd.query_radius(x, radius, count_only=True, return_distance=False) return np.array(nx) + + +def _num_points_within_radius_cd( + c: npt.ArrayLike, + d: npt.ArrayLike, + radius: npt.ArrayLike) -> npt.ArrayLike: + """ + For each point, determine the number of other points within a given radius + 
Inspired by _compute_mi_cd + """ + c = c.reshape((-1, 1)) + n_samples = c.shape[0] + m_all = np.empty(n_samples) + for label in np.unique(d): + mask = d == label + kd = KDTree(c[mask], metric="chebyshev") + m_all[mask] = kd.query_radius( + c[mask], radius[mask], + count_only=True, return_distance=False) + return m_all + + +@contextlib.contextmanager +def tqdm_joblib(tqdm_object): + """Context manager to patch joblib to report into tqdm progress bar given as argument + from https://stackoverflow.com/a/58936697/5133167 + """ + class TqdmBatchCompletionCallback(joblib.parallel.BatchCompletionCallBack): + def __call__(self, *args, **kwargs): + tqdm_object.update(n=self.batch_size) + return super().__call__(*args, **kwargs) + + old_batch_callback = joblib.parallel.BatchCompletionCallBack + joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback + try: + yield tqdm_object + finally: + joblib.parallel.BatchCompletionCallBack = old_batch_callback + tqdm_object.close() diff --git a/requirements.txt b/requirements.txt index 00814cb..43cc4f6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,8 @@ oldest-supported-numpy; python_version>="3.11.0" scikit-learn==1.2.0; python_version >= '3.8' scikit-learn==1.0.2; python_version < '3.8' +joblib>=0.11 +scipy>=1.7.3 +tqdm>=4.65.2 reno==3.5.0 diff --git a/setup.cfg b/setup.cfg index 269ee72..e84055b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -23,7 +23,10 @@ install_requires = dimod>=0.12.3 dwave-system>=1.18.0 numpy>=1.20.0 - scikit-learn>=1.0.2 + scikit-learn>=1.0.2 + scipy>=1.7.3 + joblib>=0.11 + tqdm>=4.65.0 packages = dwave dwave.plugins diff --git a/tests/test_transformer.py b/tests/test_transformer.py index 6e0ef3e..abfa25f 100644 --- a/tests/test_transformer.py +++ b/tests/test_transformer.py @@ -122,12 +122,12 @@ def test_pipeline(self): clf.predict(X) - def test_alpha_0(self): - cqm = SelectFromQuadraticModel.correlation_cqm(self.X, self.y, num_features=3, alpha=0) + def test_alpha_0(self): + cqm = SelectFromQuadraticModel().correlation_cqm(X=self.X, y=self.y, num_features=3, alpha=0) self.assertTrue(not any(cqm.objective.linear.values())) def test_alpha_1_no_quadratic(self): - cqm = SelectFromQuadraticModel.correlation_cqm(self.X, self.y, num_features=3, alpha=1) + cqm = SelectFromQuadraticModel().correlation_cqm(X=self.X, y=self.y, num_features=3, alpha=1) self.assertTrue(not any(cqm.objective.quadratic.values())) def test_alpha_1(self): @@ -183,7 +183,7 @@ def test_fixed_column(self): X[:, 1] = 0 X[:, 5] = 1 - cqm = SelectFromQuadraticModel.correlation_cqm(X, self.y, alpha=.5, num_features=5) + cqm = SelectFromQuadraticModel().correlation_cqm(X=X, y=self.y, alpha=.5, num_features=5) # in this case the linear bias for those two columns should be 0 self.assertEqual(cqm.objective.linear[1], 0) diff --git a/tests/test_utilities.py b/tests/test_utilities.py index df2a35e..aadaaef 100644 --- a/tests/test_utilities.py +++ b/tests/test_utilities.py @@ -51,7 +51,11 @@ import numpy as np -from dwave.plugins.sklearn.utilities import corrcoef, cov, dot_2d, _compute_cmi_c, _compute_cmi_d +from dwave.plugins.sklearn.utilities import ( + corrcoef, cov, dot_2d, + _compute_cmip_c, _compute_cmip_d, + _compute_cmip_cdc, _compute_cmip_ccd +) from scipy.special import digamma from sklearn.feature_selection._mutual_info import _compute_mi from sklearn.neighbors import KDTree @@ -183,28 +187,60 @@ def test_memmap(self): class TestMI(unittest.TestCase): + def test_cmi_cdc(self): + np.random.seed(42) + n_samples, n_neighbors = 4003, 4 + c1 = 
np.random.randn(n_samples,) + n_ss = 1001 + d1 = np.hstack(( + np.random.randint(-2, 1, (n_ss, )), + np.random.randint(10, 14, (n_samples-n_ss, )))) + d2 = np.random.randint(0, 4, (n_samples, )) + np.random.shuffle(d2) + cmi_ij_pair = _compute_cmip_c(c1, d1, d2, n_neighbors=n_neighbors) + cmi_ij_pair_cdc = _compute_cmip_cdc(c1, d1, d2, n_neighbors=n_neighbors) + cmi_ij_pair_ccd = _compute_cmip_ccd(c1, d1, d2, n_neighbors=n_neighbors) + self.assertAlmostEqual( + sum(cmi_ij_pair), sum(cmi_ij_pair_cdc), places=4, + msg="Computation for mixed continuous/discrete conditional mutual information is not accurate") + self.assertAlmostEqual( + sum(cmi_ij_pair), sum(cmi_ij_pair_ccd), places=4, + msg="Computation for mixed continuous/discrete conditional mutual information is not accurate") + + def test_cmi_symmetry(self): + np.random.seed(42) + n_samples, n_neighbors = 4003, 4 + # Test continuous implementation + Xy = np.random.randn(n_samples, 3) + cmi_ij_pair = _compute_cmip_c(Xy[:, 0], Xy[:, 1], Xy[:, 2], n_neighbors=n_neighbors) + cmi_ji_pair = _compute_cmip_c(Xy[:, 1], Xy[:, 0], Xy[:, 2], n_neighbors=n_neighbors) + self.assertAlmostEqual( + sum(cmi_ij_pair), sum(cmi_ji_pair), places=3, + msg="Computation for continuous conditional mutual information is not symmetric") + + c1 = np.random.randn(n_samples,) + c2 = np.random.randn(n_samples,) + n_ss = 1001 + d = np.hstack(( + np.random.randint(-2, 1, (n_ss, )), + np.random.randint(10, 14, (n_samples-n_ss, )))) + np.random.shuffle(d) + cmi_ij_pair = _compute_cmip_ccd(c1, c2, d, n_neighbors=n_neighbors) + cmi_ji_pair = _compute_cmip_ccd(c2, c1, d, n_neighbors=n_neighbors) + self.assertAlmostEqual( + sum(cmi_ij_pair), sum(cmi_ji_pair), places=3, + msg="Computation for mixed continuous/discrete conditional mutual information is not symmetric") + def test_cmi(self): """ We test the algorithm using the formula `I(x;y|z) = I(x;y) + I(z;y) - I(z;y|x)`. Since the mutual information data is an sklearn function, :func:`sklearn.feature_selection._mutual_info._compute_mi` it is highly likely that formula is valid only if our algorithm is correct. 
- For continous variables the formula may still break for some seeds """ np.random.seed(42) - n_samples, n_neighbors = 4003, 4 - # Test continuous implementation - Xy = np.random.randn(n_samples, 3) - cmi_ij = _compute_cmi_c(Xy[:, 0], Xy[:, 1], Xy[:, 2], n_neighbors=n_neighbors) - cmi_ji = _compute_cmi_c(Xy[:, 1], Xy[:, 0], Xy[:, 2], n_neighbors=n_neighbors) - mi_i = _compute_mi(Xy[:, 0], Xy[:, 2], False, False, n_neighbors=n_neighbors) - mi_j = _compute_mi(Xy[:, 1], Xy[:, 2], False, False, n_neighbors=n_neighbors) - self.assertAlmostEqual( - max(cmi_ij + mi_j - mi_i, 0), cmi_ji, places=3, - msg="The formula for continuous conditional mutual information is violated") - # Test discrete implementation - n_samples, n_ss = 103, 51 + n_samples, n_ss, n_neighbors = 103, 51, 4 xi = np.random.randint(0, 11, (n_samples, )) xj = np.hstack(( np.random.randint(-2, 1, (n_ss, )), @@ -212,13 +248,16 @@ def test_cmi(self): np.random.shuffle(xi) np.random.shuffle(xj) y = np.random.randint(-12, -10, (n_samples, )) - cmi_ij = _compute_cmi_d(xi, xj, y) - cmi_ji = _compute_cmi_d(xj, xi, y) + cmi_ij, cmi_ji = _compute_cmip_d(xi, xj, y) mi_i = _compute_mi(xi, y, True, True, n_neighbors=n_neighbors) mi_j = _compute_mi(xj, y, True, True, n_neighbors=n_neighbors) self.assertAlmostEqual( cmi_ij + mi_j - mi_i, cmi_ji, places=5, msg="The formula for discrete conditional mutual information is violated") + cmi_ij2 = _compute_cmip_d(xj, xi, y) + self.assertAlmostEqual( + sum(cmi_ij2), cmi_ji+cmi_ij, places=5, + msg="Discrete conditional mutual information computation is not symmetric") def test_implementation(self): # methods from https://github.com/jannisteunissen/mutual_information @@ -231,7 +270,14 @@ def _compute_cmi_t(x, z, y, n_neighbors): xyz = np.hstack((x, y, z)) k = np.full(n_samples, n_neighbors) radius = get_radius_kneighbors(xyz, n_neighbors) - + + mask = (radius == 0) + if mask.sum() > 0: + vals, ix, counts = np.unique(xyz[mask], axis=0, + return_inverse=True, + return_counts=True) + k[mask] = counts[ix] - 1 + nxz = num_points_within_radius(np.hstack((x, z)), radius) nyz = num_points_within_radius(np.hstack((y, z)), radius) nz = num_points_within_radius(z, radius) @@ -267,14 +313,20 @@ def num_points_within_radius(x, radius): kd = KDTree(x, metric="chebyshev") nx = kd.query_radius(x, radius, count_only=True, return_distance=False) return np.array(nx) - 1.0 - + + np.random.seed(42) n_samples, n_neighbors = 103, 4 # Test continuous implementation Xy = np.random.randn(n_samples, 3) - cmi_t = _compute_cmi_t(Xy[:,0], Xy[:,1], Xy[:,2], n_neighbors=n_neighbors) + cmi_t_01 = _compute_cmi_t(Xy[:,0], Xy[:,1], Xy[:,2], n_neighbors=n_neighbors) + cmi_t_10 = _compute_cmi_t(Xy[:,1], Xy[:,0], Xy[:,2], n_neighbors=n_neighbors) - cmi_c = _compute_cmi_c(Xy[:,0], Xy[:,1], Xy[:,2], n_neighbors=n_neighbors) + cmi_ij, cmi_ji = _compute_cmip_c(Xy[:,0], Xy[:,1], Xy[:,2], n_neighbors=n_neighbors) + + self.assertAlmostEqual( + max(cmi_t_01, 0), cmi_ij, places=5, + msg="The algorithm doesn't match the original implementation") self.assertAlmostEqual( - cmi_t, cmi_c, places=5, + max(cmi_t_10, 0), cmi_ji, places=5, msg="The algorithm doesn't match the original implementation") From 93b8b88717320131c1de12daf0f5c37692998d63 Mon Sep 17 00:00:00 2001 From: Aivar Sootla Date: Wed, 10 Apr 2024 17:12:59 +0100 Subject: [PATCH 05/12] addressing the first pass of the PR comments --- dwave/plugins/sklearn/transformers.py | 139 ++++++++++++++------------ dwave/plugins/sklearn/utilities.py | 131 +++++++++++++++--------- requirements.txt | 5 +- 
setup.cfg | 6 +- tests/test_utilities.py | 88 ++++++++++++++-- 5 files changed, 235 insertions(+), 134 deletions(-) diff --git a/dwave/plugins/sklearn/transformers.py b/dwave/plugins/sklearn/transformers.py index 88dc149..1333e00 100644 --- a/dwave/plugins/sklearn/transformers.py +++ b/dwave/plugins/sklearn/transformers.py @@ -14,25 +14,29 @@ from __future__ import annotations +from inspect import signature from itertools import combinations import tempfile import typing import dimod -from dwave.plugins.sklearn.utilities import corrcoef, _compute_off_diagonal_mi, tqdm_joblib from dwave.cloud.exceptions import ConfigFileError, SolverAuthenticationError +from dwave.plugins.sklearn.utilities import ( + corrcoef, + _compute_off_diagonal_cmi, + _compute_off_diagonal_mi, + _iterate_columns) from dwave.system import LeapHybridCQMSampler -from joblib import Parallel, delayed, cpu_count import numpy as np import numpy.typing as npt from scipy.sparse import issparse from sklearn.base import BaseEstimator from sklearn.feature_selection import SelectorMixin -from sklearn.feature_selection._mutual_info import _compute_mi, _iterate_columns +from sklearn.feature_selection import mutual_info_classif, mutual_info_regression from sklearn.preprocessing import scale from sklearn.utils import check_random_state +from sklearn.utils.parallel import Parallel, delayed from sklearn.utils.validation import check_is_fitted, check_array, check_X_y -from tqdm import tqdm __all__ = ["SelectFromQuadraticModel"] @@ -211,7 +215,7 @@ def mutual_information_cqm( discrete_target: bool = False, copy: bool = True, n_neighbors: int = 4, - n_workers: int = 1, + n_jobs: typing.Union[None, int] = None, random_state: typing.Union[None, int] = None, ) -> dimod.ConstrainedQuadraticModel: """Build a constrained quadratic model for feature selection. @@ -255,8 +259,12 @@ def mutual_information_cqm( See :func:`sklearn.feature_selection._mutual_info._estimate_mi` random_state: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` - n_workers: int, default=1 - Number of workers for parallel computation on the cpu + n_jobs: int, default=None + The number of parallel jobs to run for the conditional mutual information + computation. The parallelization is over the columns of `X`. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. Returns: A constrained quadratic model. 
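A minimal sketch of building this CQM directly, mirroring how the tests call `correlation_cqm`; the toy data, the chosen keyword values, and the assertion are illustrative assumptions rather than part of the patch:

import numpy as np
from dwave.plugins.sklearn.transformers import SelectFromQuadraticModel

rng = np.random.default_rng(0)
X = rng.standard_normal((200, 6))
y = X[:, 0] + 0.5 * X[:, 1] + 0.1 * rng.standard_normal(200)

# Build the model without invoking a hybrid solver; num_features and alpha
# follow the call style used in tests/test_transformer.py.
cqm = SelectFromQuadraticModel().mutual_information_cqm(
    X, y, num_features=3, alpha=0.5, conditional=True, n_jobs=1)
assert len(cqm.variables) == X.shape[1]  # one binary variable per feature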
@@ -284,20 +292,24 @@ def mutual_information_cqm( ) mi = estimate_mi_matrix( - X, y, discrete_features, discrete_target, + X, y, + discrete_features=discrete_features, + discrete_target=discrete_target, n_neighbors=n_neighbors, copy=copy, - random_state=random_state, n_workers=n_workers, + random_state=random_state, n_jobs=n_jobs, conditional=conditional) - if not conditional: - # mutliplying all features with num_features - np.multiply(mi, num_features, out=mi) - # mutpliypling off-diagonal ones with -(1-alpha) - np.multiply(mi, -(1 - alpha), out=mi) + if conditional: + np.multiply(mi, -1, out=mi) + else: + # mutpliypling off-diagonal ones with -1 + diagonal = -np.diag(mi).copy() # mutpliypling off-diagonal ones with alpha - diagonal = alpha * np.diag(mi) + np.multiply(mi, alpha, out=mi) np.fill_diagonal(mi, diagonal) + self.mi_matrix = mi + it = np.nditer(mi, flags=['multi_index'], op_flags=[['readonly']]) cqm.set_objective((*it.multi_index, x) for x in it if x) return cqm @@ -362,7 +374,8 @@ def fit( if self.method == "correlation": cqm = self.correlation_cqm(X, y, num_features=num_features, alpha=alpha) elif self.method == "mutual_information": - cqm = self.mutual_information_cqm(X, y, num_features=num_features, alpha=alpha, **kwargs) + cqm = self.mutual_information_cqm( + X, y, num_features=num_features, alpha=alpha, **kwargs) else: raise ValueError(f"only methods {self.acceptable_methods} are implemented") @@ -432,8 +445,7 @@ def estimate_mi_matrix( conditional: bool = True, copy: bool = True, random_state: typing.Union[None, int] = None, - n_workers: int = 1, - n_subsample: int = -1 + n_jobs: typing.Union[None, int] = None, ) -> npt.ArrayLike: """ For the feature array `X` and the target array `y` computes @@ -473,8 +485,12 @@ def estimate_mi_matrix( random_state: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` - n_workers: int, default=1 - Number of workers for parallel computation on the cpu + n_jobs: int, default=None + The number of parallel jobs to run for the conditional mutual information + computation. The parallelization is over the columns of `X`. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. Returns: mi_matrix : ndarray, shape (n_features, n_features) @@ -490,7 +506,6 @@ def estimate_mi_matrix( mutual information estimation for mixed, discrete and continuous data." IEEE Transactions on Information Theory 67.1 (2020): 464-484. """ - X, y = check_X_y(X, y, accept_sparse="csc", y_numeric=not discrete_target) n_samples, n_features = X.shape @@ -504,64 +519,56 @@ def estimate_mi_matrix( discrete_mask.fill(discrete_features) else: discrete_features = check_array(discrete_features, ensure_2d=False) - if discrete_features.dtype != "bool": + if np.issubdtype(discrete_features.dtype, bool): discrete_mask = np.zeros(n_features, dtype=bool) discrete_mask[discrete_features] = True else: discrete_mask = discrete_features - continuous_mask = ~discrete_mask - if np.any(continuous_mask) and issparse(X): - raise ValueError("Sparse matrix `X` can't have continuous features.") - - rng = check_random_state(random_state) - if np.any(continuous_mask): - if copy: - X = X.copy() - - X[:, continuous_mask] = scale( - X[:, continuous_mask], with_mean=False, copy=False - ) - - # Add small noise to continuous features as advised in Kraskov et. al. 
- X = X.astype(np.float64, copy=False) - means = np.maximum(1, np.mean(np.abs(X[:, continuous_mask]), axis=0)) - X[:, continuous_mask] += ( - 1e-10 - * means - * rng.standard_normal(size=(n_samples, np.sum(continuous_mask))) - ) - - if not discrete_target: + # copying X if needed + if copy: + X = X.copy() + + # Computing the diagonal terms + mutual_info_args = dict(X=X, y=y, + discrete_features=discrete_mask, + n_neighbors=n_neighbors, + copy=False, random_state=random_state) + # In earlier versions of sklearn functions `mutual_info_classif` and + # `mutual_info_regression` do not allow for parallel computations and + # do not have `n_jobs` argument. + if "n_jobs" in signature(mutual_info_classif).parameters and \ + "n_jobs" in signature(mutual_info_regression).parameters: + mutual_info_args["n_jobs"] = n_jobs + if discrete_target: + diagonal_vals = mutual_info_classif(**mutual_info_args) + else: + diagonal_vals = mutual_info_regression(**mutual_info_args) + # the target y is not modified even if copy=False + # https://github.com/scikit-learn/scikit-learn/issues/28793 + rng = check_random_state(random_state) y = scale(y, with_mean=False) y += ( 1e-10 * np.maximum(1, np.mean(np.abs(y))) * rng.standard_normal(size=n_samples) - ) - - n_features = X.shape[1] - max_n_workers = cpu_count()-1 - if n_workers > max_n_workers: - n_workers = max_n_workers - Warning(f"Specified number of workers {n_workers} is larger than the number of cpus." - f"Will use only {max_n_workers}.") - - mi_matrix = np.zeros((n_features, n_features), dtype=np.float64) - with tqdm_joblib(tqdm(desc="Computing off-diagonal terms", total=len(discrete_mask) * (len(discrete_mask) - 1) // 2 )) as progress_bar: - off_diagonal_vals = Parallel(n_jobs=n_workers)( - delayed(_compute_off_diagonal_mi) - (xi, xj, y, discrete_feature_i, discrete_feature_j, discrete_target, n_neighbors, conditional, n_subsample) - for (xi, discrete_feature_i), (xj, discrete_feature_j) in combinations(zip(_iterate_columns(X), discrete_mask), 2) - ) - diagonal_vals = Parallel(n_jobs=n_workers)( - delayed(_compute_mi) - (xi, y, discrete_feature_i, discrete_target, n_neighbors) - for xi, discrete_feature_i in zip(_iterate_columns(X), discrete_mask) ) + + mi_matrix = np.zeros((n_features, n_features), dtype=np.float64) np.fill_diagonal(mi_matrix, diagonal_vals) + off_diagonal_iter = combinations(zip(_iterate_columns(X), discrete_mask), 2) + if conditional: + off_diagonal_vals = Parallel(n_jobs=n_jobs, verbose=1)( + delayed(_compute_off_diagonal_cmi) + (xi, xj, y, discrete_feature_i, discrete_feature_j, discrete_target, n_neighbors) + for (xi, discrete_feature_i), (xj, discrete_feature_j) in off_diagonal_iter) + else: + off_diagonal_vals = Parallel(n_jobs=n_jobs, verbose=1)( + delayed(_compute_off_diagonal_mi) + (xi, xj, discrete_feature_i, discrete_feature_j, n_neighbors) + for (xi, discrete_feature_i), (xj, discrete_feature_j) in off_diagonal_iter) + off_diagonal_val = iter(off_diagonal_vals) for i, j in combinations(range(n_features), 2): mi_matrix[i, j] = mi_matrix[j, i] = next(off_diagonal_val) return mi_matrix - diff --git a/dwave/plugins/sklearn/utilities.py b/dwave/plugins/sklearn/utilities.py index 5910b53..dee7565 100644 --- a/dwave/plugins/sklearn/utilities.py +++ b/dwave/plugins/sklearn/utilities.py @@ -49,14 +49,13 @@ import typing -import contextlib -import joblib import numpy as np import numpy.typing as npt +from scipy.sparse import issparse from scipy.special import digamma from scipy.stats import entropy -from 
sklearn.feature_selection._mutual_info import _compute_mi from sklearn.neighbors import NearestNeighbors, KDTree +from sklearn.feature_selection import mutual_info_classif, mutual_info_regression __all__ = ["corrcoef", "cov", "dot_2d"] @@ -232,7 +231,7 @@ def dot_2d(a: npt.ArrayLike, b: npt.ArrayLike, *, return out -def _compute_off_diagonal_mi( +def _compute_off_diagonal_cmi( xi: npt.ArrayLike, xj: npt.ArrayLike, y: npt.ArrayLike, @@ -240,16 +239,10 @@ def _compute_off_diagonal_mi( discrete_feature_j: bool, discrete_target: bool, n_neighbors: int = 4, - conditional: bool = True, - n_subsample: int = -1 ): """ - Computing a distance `d` between features `x_i` and `x_j`. - If `conditional = True`, then conditional mutual infomation is used - `I(x_i; y | x_j)`. - - If `conditonal = False` then mutual information is used - `I(x_i; x_j)`. + Computing a distance `d` based on the conditional mutual infomation + between features `x_i` and `x_j`: `d = (I(x_i; y | x_j)+I(x_j; y | x_i))/2`. References ---------- @@ -261,25 +254,45 @@ def _compute_off_diagonal_mi( mutual information estimation for mixed, discrete and continuous data." IEEE Transactions on Information Theory 67.1 (2020): 464-484. """ - if conditional: - if n_subsample > 0: - idx = np.arange(xi.shape[0]) - np.random.shuffle(idx) - idx = idx[:n_subsample] - xi, xj, y = xi[idx], xj[idx], y[idx] - if discrete_feature_i and discrete_feature_j and discrete_target: - val = _compute_cmip_d(xi, xj, y) - elif n_subsample >= 0 and discrete_target: - val = _compute_cmip_ccd(xi, xj, y, n_neighbors) - elif n_subsample >= 0 and discrete_feature_i and (not discrete_target): - val = _compute_cmip_cdc(xj, xi, y, n_neighbors) - elif n_subsample >= 0 and (not discrete_feature_i) and discrete_feature_j and (not discrete_target): - val = _compute_cmip_cdc(xi, xj, y, n_neighbors) - else: - val = _compute_cmip_c(xi, xj, y, n_neighbors) - return sum(val) / 2 + if discrete_feature_i and discrete_feature_j and discrete_target: + val = _compute_cmip_d(xi, xj, y) + elif discrete_target: + val = _compute_cmip_ccd(xi, xj, y, n_neighbors) + elif discrete_feature_i and (not discrete_target): + val = _compute_cmip_cdc(xj, xi, y, n_neighbors) + elif (not discrete_feature_i) and discrete_feature_j and (not discrete_target): + val = _compute_cmip_cdc(xi, xj, y, n_neighbors) else: - return _compute_mi(xi, xj, discrete_feature_i, discrete_feature_j, n_neighbors) + val = _compute_cmip_c(xi, xj, y, n_neighbors) + return sum(val) / 2 + + +def _compute_off_diagonal_mi( + xi: npt.ArrayLike, + xj: npt.ArrayLike, + discrete_feature_i: bool, + discrete_feature_j: bool, + n_neighbors: int = 4, + ): + """ + Compute mutual information between features `I(x_i; x_j)`. + + References + ---------- + .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual + information". Phys. Rev. E 69, 2004. + .. [2] B. C. Ross "Mutual Information between Discrete and Continuous + Data Sets". PLoS ONE 9(2), 2014. + """ + if discrete_feature_j: + return mutual_info_classif( + xi.reshape(-1, 1), xj, discrete_features=[discrete_feature_i], n_neighbors=n_neighbors) + elif discrete_feature_i: + return mutual_info_classif( + xj.reshape(-1, 1), xi, discrete_features=[discrete_feature_j], n_neighbors=n_neighbors) + else: + return mutual_info_regression( + xi.reshape(-1, 1), xj, discrete_features=[discrete_feature_i], n_neighbors=n_neighbors) def _compute_cmip_d( @@ -368,7 +381,8 @@ def _compute_cmip_ccd( and `(I(x_j; y | x_i)`. 
Random variables `x_i`, `x_j` are assumed to be continuous, while `y` is assumed to be discrete. - Adapted from https://github.com/jannisteunissen/mutual_information + Adapted from https://github.com/jannisteunissen/mutual_information and + :func:`sklearn.feature_selection._mutual_info._compute_mi_cd` """ xi = xi.reshape((-1, 1)) xj = xj.reshape((-1, 1)) @@ -423,7 +437,8 @@ def _compute_cmip_cdc( and `(I(x_j; y | x_i)`. Random variables `x_i`, `y` are assumed to be continuous, while `x_j` is assumed to be discrete. - Adapted from https://github.com/jannisteunissen/mutual_information + Adapted from https://github.com/jannisteunissen/mutual_information and + :func:`sklearn.feature_selection._mutual_info._compute_mi_cd` """ xi = xi.reshape((-1, 1)) y = y.reshape((-1, 1)) @@ -477,7 +492,10 @@ def _get_radius_k_neighbours_d( d: npt.ArrayLike, n_neighbors: int = 4 ) -> typing.Tuple[npt.ArrayLike, npt.ArrayLike, npt.ArrayLike]: - + """ + Determine smallest radius around x containing n_neighbors neighbors + Inspired by :func:`sklearn.feature_selection._mutual_info._compute_mi_cd` + """ c1 = c1.reshape((-1, 1)) c2 = c2.reshape((-1, 1)) n_samples = c1.shape[0] @@ -520,7 +538,7 @@ def _num_points_within_radius_cd( radius: npt.ArrayLike) -> npt.ArrayLike: """ For each point, determine the number of other points within a given radius - Inspired by _compute_mi_cd + Inspired by :func:`sklearn.feature_selection._mutual_info._compute_mi_cd` """ c = c.reshape((-1, 1)) n_samples = c.shape[0] @@ -534,20 +552,33 @@ def _num_points_within_radius_cd( return m_all -@contextlib.contextmanager -def tqdm_joblib(tqdm_object): - """Context manager to patch joblib to report into tqdm progress bar given as argument - from https://stackoverflow.com/a/58936697/5133167 - """ - class TqdmBatchCompletionCallback(joblib.parallel.BatchCompletionCallBack): - def __call__(self, *args, **kwargs): - tqdm_object.update(n=self.batch_size) - return super().__call__(*args, **kwargs) +def _iterate_columns(X, columns=None): + """Iterate over columns of a matrix. + Copied from :func:`sklearn.feature_selection._mutual_info._iterate_columns` - old_batch_callback = joblib.parallel.BatchCompletionCallBack - joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback - try: - yield tqdm_object - finally: - joblib.parallel.BatchCompletionCallBack = old_batch_callback - tqdm_object.close() + Parameters + ---------- + X : ndarray or csc_matrix, shape (n_samples, n_features) + Matrix over which to iterate. + + columns : iterable or None, default=None + Indices of columns to iterate over. If None, iterate over all columns. + + Yields + ------ + x : ndarray, shape (n_samples,) + Columns of `X` in dense format. 
+ """ + if columns is None: + columns = range(X.shape[1]) + + if issparse(X): + for i in columns: + x = np.zeros(X.shape[0]) + start_ptr, end_ptr = X.indptr[i], X.indptr[i + 1] + x[X.indices[start_ptr:end_ptr]] = X.data[start_ptr:end_ptr] + yield x + else: + for i in columns: + yield X[:, i] + \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 43cc4f6..ee3a4a4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,10 +5,7 @@ numpy==1.20.0; python_version<"3.10.0" # oldest supported by dwave-system numpy==1.22.0; python_version~="3.10.0" # needed for scipy on windows oldest-supported-numpy; python_version>="3.11.0" -scikit-learn==1.2.0; python_version >= '3.8' -scikit-learn==1.0.2; python_version < '3.8' -joblib>=0.11 +scikit-learn==1.2.1 scipy>=1.7.3 -tqdm>=4.65.2 reno==3.5.0 diff --git a/setup.cfg b/setup.cfg index e84055b..b03a6f0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -23,15 +23,13 @@ install_requires = dimod>=0.12.3 dwave-system>=1.18.0 numpy>=1.20.0 - scikit-learn>=1.0.2 + scikit-learn>=1.2.1 scipy>=1.7.3 - joblib>=0.11 - tqdm>=4.65.0 packages = dwave dwave.plugins dwave.plugins.sklearn -python_requires = >=3.7 +python_requires = >=3.8 [pycodestyle] max-line-length = 100 diff --git a/tests/test_utilities.py b/tests/test_utilities.py index aadaaef..0ab57c7 100644 --- a/tests/test_utilities.py +++ b/tests/test_utilities.py @@ -47,18 +47,29 @@ import os.path import tempfile +import typing import unittest import numpy as np +import numpy.typing as npt from dwave.plugins.sklearn.utilities import ( - corrcoef, cov, dot_2d, - _compute_cmip_c, _compute_cmip_d, + corrcoef, cov, dot_2d, + _compute_off_diagonal_cmi, + _compute_off_diagonal_mi, + _compute_cmip_c, _compute_cmip_d, _compute_cmip_cdc, _compute_cmip_ccd ) +from dwave.plugins.sklearn.transformers import estimate_mi_matrix +from scipy.linalg import norm +from scipy.sparse import issparse from scipy.special import digamma from sklearn.feature_selection._mutual_info import _compute_mi +from sklearn.feature_selection import mutual_info_classif, mutual_info_regression from sklearn.neighbors import KDTree +from sklearn.preprocessing import scale +from sklearn.utils import check_random_state +from sklearn.utils.validation import check_array, check_X_y class TestCorrCoef(unittest.TestCase): def test_agreement(self): @@ -235,7 +246,7 @@ def test_cmi(self): """ We test the algorithm using the formula `I(x;y|z) = I(x;y) + I(z;y) - I(z;y|x)`. Since the mutual information data is an sklearn function, - :func:`sklearn.feature_selection._mutual_info._compute_mi` + :func:`sklearn.feature_selection._mutual_info.mutual_info_classif` it is highly likely that formula is valid only if our algorithm is correct. 
""" np.random.seed(42) @@ -249,16 +260,73 @@ def test_cmi(self): np.random.shuffle(xj) y = np.random.randint(-12, -10, (n_samples, )) cmi_ij, cmi_ji = _compute_cmip_d(xi, xj, y) - mi_i = _compute_mi(xi, y, True, True, n_neighbors=n_neighbors) - mi_j = _compute_mi(xj, y, True, True, n_neighbors=n_neighbors) - self.assertAlmostEqual( - cmi_ij + mi_j - mi_i, cmi_ji, places=5, - msg="The formula for discrete conditional mutual information is violated") + mi_i = mutual_info_classif( + xi.reshape(-1, 1), y, discrete_features=True, n_neighbors=n_neighbors) + mi_j = mutual_info_classif( + xj.reshape(-1, 1), y, discrete_features=True, n_neighbors=n_neighbors) + self.assertTrue( + np.allclose(cmi_ij + mi_j - mi_i, cmi_ji, atol=1e-5), + msg="The chain rule for discrete conditional mutual information is violated") cmi_ij2 = _compute_cmip_d(xj, xi, y) - self.assertAlmostEqual( - sum(cmi_ij2), cmi_ji+cmi_ij, places=5, + self.assertTrue( + np.allclose(sum(cmi_ij2), cmi_ji+cmi_ij, atol=1e-5), msg="Discrete conditional mutual information computation is not symmetric") + def test_mi(self): + np.random.seed(42) + # Test discrete implementation + n_samples, n_ss, n_neighbors = 103, 51, 4 + xi = np.random.randint(0, 11, (n_samples, )) + xj = np.hstack(( + np.random.randint(-2, 1, (n_ss, )), + np.random.randint(10, 14, (n_samples-n_ss, )))) + np.random.shuffle(xi) + np.random.shuffle(xj) + mi_ij = _compute_off_diagonal_mi( + xi.reshape(-1, 1), xj, True, True, n_neighbors=n_neighbors) + mi_ji = _compute_off_diagonal_mi( + xi.reshape(-1, 1), xj, True, True, n_neighbors=n_neighbors) + mi_ij_cl = mutual_info_classif(xi.reshape(-1, 1), xj, discrete_features=True) + self.assertTrue( + np.allclose(mi_ij_cl, mi_ij), + msg="Discrete mutual information is not computed correctly") + self.assertTrue( + np.allclose(mi_ij_cl, mi_ji), + msg="Discrete mutual information is not computed correctly") + + mi_ij = _compute_off_diagonal_mi( + xi, xj, False, True, n_neighbors=n_neighbors) + mi_ji = _compute_off_diagonal_mi( + xi, xj, False, True, n_neighbors=n_neighbors) + mi_ij_cl = mutual_info_classif( + xi.reshape(-1, 1), xj, discrete_features=False) + self.assertTrue( + np.allclose(mi_ij_cl, mi_ij), + msg="Mixed mutual information is not computed correctly") + self.assertTrue( + np.allclose(mi_ij_cl, mi_ji), + msg="Mixed mutual information is not computed correctly") + + xj = np.random.randn(n_samples) + mi_ij = _compute_off_diagonal_mi( + xi, xj, True, False, n_neighbors=n_neighbors) + mi_ij_cl = mutual_info_regression( + xi.reshape(-1, 1), xj, + discrete_features=True, n_neighbors=n_neighbors) + self.assertTrue( + np.allclose(mi_ij_cl, mi_ij, atol=1e-5), + msg="Continuous mutual information is not computed correctly") + + xi = np.random.randn(n_samples) + mi_ij = _compute_off_diagonal_mi( + xi, xj, False, False, n_neighbors=n_neighbors) + mi_ij_cl = mutual_info_regression( + xi.reshape(-1, 1), xj, + discrete_features=False, n_neighbors=n_neighbors) + self.assertTrue( + np.allclose(mi_ij_cl, mi_ij, atol=1e-3), + msg="Continuous mutual information is not computed correctly") + def test_implementation(self): # methods from https://github.com/jannisteunissen/mutual_information def _compute_cmi_t(x, z, y, n_neighbors): From 0ab51dd05ce4ce5786058af69aef41e22b095243 Mon Sep 17 00:00:00 2001 From: Aivar Sootla Date: Tue, 16 Apr 2024 15:39:39 +0100 Subject: [PATCH 06/12] Incoporate comments from sklearn, fix circle.ci, fix tests, fix formating --- .circleci/config.yml | 17 +- dwave/plugins/sklearn/transformers.py | 239 ++++++----- 
dwave/plugins/sklearn/utilities.py | 371 ++++++++++-------- .../notes/mutual-info-ddd1a23e320f88ae.yaml | 5 + requirements.txt | 2 +- setup.cfg | 4 +- tests/test_transformer.py | 4 +- tests/test_utilities.py | 107 ++--- 8 files changed, 387 insertions(+), 362 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 5c59d81..84a4ef1 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -205,22 +205,13 @@ workflows: name: test-linux-<< matrix.python-version >> | << matrix.pip-constraints >> matrix: parameters: - python-version: &python-versions ["3.7.9", "3.8.6", "3.9.0", "3.10.0", "3.11.0"] + python-version: &python-versions ["3.8.6", "3.9.0", "3.10.0", "3.11.0"] pip-constraints: - - "scikit-learn==1.0.2" # lowest supported by package + - "scikit-learn==1.2.1" # lowest supported by package # - "scikit-learn~=1.0.0" # 1.0.2 is the highest in ~=1.0.0 - - "scikit-learn~=1.1.0" - - "scikit-learn~=1.2.0" # latest in current minor as of March 2023 + # - "scikit-learn~=1.1.0" + # - "scikit-learn~=1.2.0" # latest in current minor as of March 2023 - "scikit-learn~=1.0" # latest in current major - exclude: - # sklearn < 1.1.3 does not support Python 3.11 - - python-version: "3.11.0" - pip-constraints: "scikit-learn==1.0.2" - # sklearn > 1.0.2 does not support Python 3.7 - - python-version: "3.7.9" - pip-constraints: "scikit-learn~=1.1.0" - - python-version: "3.7.9" - pip-constraints: "scikit-learn~=1.2.0" - test-macos: matrix: diff --git a/dwave/plugins/sklearn/transformers.py b/dwave/plugins/sklearn/transformers.py index 1333e00..01275d0 100644 --- a/dwave/plugins/sklearn/transformers.py +++ b/dwave/plugins/sklearn/transformers.py @@ -14,7 +14,6 @@ from __future__ import annotations -from inspect import signature from itertools import combinations import tempfile import typing @@ -23,8 +22,8 @@ from dwave.cloud.exceptions import ConfigFileError, SolverAuthenticationError from dwave.plugins.sklearn.utilities import ( corrcoef, - _compute_off_diagonal_cmi, - _compute_off_diagonal_mi, + _compute_cmi_distance, + _compute_mi, _iterate_columns) from dwave.system import LeapHybridCQMSampler import numpy as np @@ -56,21 +55,36 @@ class SelectFromQuadraticModel(SelectorMixin, BaseEstimator): :class:`sklearn.feature_selection.SelectKBest`. num_features: The number of features to select. + method: + If equal to ``correlation`` uses a correlation as a criterion for + choosing the features according to [1]_. If equal to ``mutual_information`` + uses mutual information criteria [2]_ or [3]_ (advances feature). time_limit: The time limit for the run on the hybrid solver. + .. [1] [Milne et al.] Milne, Andrew, Maxwell Rounds, and Phil Goddard. 2017. "Optimal Feature + Selection in Credit Scoring and Classification Using a Quantum Annealer." + 1QBit; White Paper. + https://1qbit.com/whitepaper/optimal-feature-selection-in-credit-scoring-classification-using-quantum-annealer + .. [2] Peng, F. Long, and C. Ding. Feature selection based on mutual information criteria + of max-dependency, max-relevance, and min-redundancy. IEEE Transactions on pattern + analysis and machine intelligence, 27(8):1226–1238, 2005. + .. [3] X. V. Nguyen, J. Chan, S. Romano, and J. Bailey. Effective global approaches for + mutual information based feature selection. In Proceedings of the 20th ACM SIGKDD + international conference on Knowledge discovery and data mining, + pages 512–521. ACM, 2014. 
""" ACCEPTED_METHODS = [ "correlation", "mutual_information", - ] + ] def __init__( self, *, alpha: float = .5, - method: str = "correlation", # undocumented until there is another supported + method: str = "correlation", num_features: int = 10, time_limit: typing.Optional[float] = None, ): @@ -128,7 +142,7 @@ def correlation_cqm( """Build a constrained quadratic model for feature selection. This method is based on maximizing influence and independence as - measured by correlation [Milne et al.]_. + measured by correlation [1]_. Args: X: @@ -153,7 +167,7 @@ def correlation_cqm( Returns: A constrained quadratic model. - .. [Milne et al.] Milne, Andrew, Maxwell Rounds, and Phil Goddard. 2017. "Optimal Feature + .. [1] Milne, Andrew, Maxwell Rounds, and Phil Goddard. 2017. "Optimal Feature Selection in Credit Scoring and Classification Using a Quantum Annealer." 1QBit; White Paper. https://1qbit.com/whitepaper/optimal-feature-selection-in-credit-scoring-classification-using-quantum-annealer @@ -169,7 +183,7 @@ def correlation_cqm( '==' if strict else '<=', num_features, label=f"{num_features}-hot", - ) + ) with tempfile.TemporaryFile() as fX, tempfile.TemporaryFile() as fout: # we make a copy of X because we'll be modifying it in-place within @@ -184,7 +198,7 @@ def correlation_cqm( dtype=np.result_type(X, y), mode="w+", shape=(X_copy.shape[1], X_copy.shape[1]), - ) + ) # main calculation. It modifies X_copy in-place corrcoef(X_copy, out=correlations, rowvar=False, copy=False) # we don't care about the direction of correlation in terms of @@ -193,7 +207,7 @@ def correlation_cqm( # multiplying all but last columns and rows with (1 - alpha) np.multiply(correlations[:-1, :-1], (1 - alpha), out=correlations[:-1, :-1]) # our objective - # we multiply by num_features to have consistent performance + # we multiply by num_features to have consistent performance # with the increase of the number of features np.fill_diagonal(correlations, correlations[:, -1] * (- alpha * num_features)) # Note: we only add terms on and above the diagonal @@ -203,7 +217,7 @@ def correlation_cqm( return cqm def mutual_information_cqm( - self, + self, X: npt.ArrayLike, y: npt.ArrayLike, *, @@ -217,16 +231,16 @@ def mutual_information_cqm( n_neighbors: int = 4, n_jobs: typing.Union[None, int] = None, random_state: typing.Union[None, int] = None, - ) -> dimod.ConstrainedQuadraticModel: + ) -> dimod.ConstrainedQuadraticModel: """Build a constrained quadratic model for feature selection. - If ``conditional`` is True then the conditional mutual information + If ``conditional`` is True then the conditional mutual information criterion from [2] is used, and if ``conditional`` is False then mutual information based criterion from [1] is used. - - For computation of mutual information and conditional mutual information - - + + For computation of mutual information and conditional mutual information + + Args: X: Feature vectors formatted as a numerical 2D array-like. 
@@ -249,16 +263,16 @@ def mutual_information_cqm( conditional: bool, default=True Whether to compute the off-diagonal terms using the conditional mutual information or joint mutual information - discrete_features: - See :func:`sklearn.feature_selection._mutual_info._estimate_mi` - discrete_target: - See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + discrete_features: + See :func:`sklearn.feature_selection.mutual_info_regression` + discrete_target: bool, default=False + Whether the target variable `y` is discrete n_neighbors: - See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + See :func:`sklearn.feature_selection.mutual_info_regression` copy: - See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + See :func:`sklearn.feature_selection.mutual_info_regression` random_state: - See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + See :func:`sklearn.feature_selection.mutual_info_regression` n_jobs: int, default=None The number of parallel jobs to run for the conditional mutual information computation. The parallelization is over the columns of `X`. @@ -269,17 +283,18 @@ def mutual_information_cqm( Returns: A constrained quadratic model. - References: - .. [1] Peng, F. Long, and C. Ding. Feature selection based on mutual information criteria of max-dependency, - max-relevance, and min-redundancy. IEEE Transactions on pattern analysis and machine intelligence, - 27(8):1226–1238, 2005. - .. [2] X. V. Nguyen, J. Chan, S. Romano, and J. Bailey. Effective global approaches for mutual information - based feature selection. In Proceedings of the 20th ACM SIGKDD international conference on - Knowledge discovery and data mining, pages 512–521. ACM, 2014. + References: + .. [1] Peng, F. Long, and C. Ding. Feature selection based on mutual information criteria + of max-dependency, max-relevance, and min-redundancy. IEEE Transactions on pattern + analysis and machine intelligence, 27(8):1226–1238, 2005. + .. [2] X. V. Nguyen, J. Chan, S. Romano, and J. Bailey. Effective global approaches for + mutual information based feature selection. In Proceedings of the 20th ACM SIGKDD + international conference on Knowledge discovery and data mining, + pages 512–521. ACM, 2014. """ - + self._check_params(X, y, alpha, num_features) - + cqm = dimod.ConstrainedQuadraticModel() cqm.add_variables(dimod.BINARY, X.shape[1]) @@ -289,30 +304,30 @@ def mutual_information_cqm( '==' if strict else '<=', num_features, label=f"{num_features}-hot", - ) + ) mi = estimate_mi_matrix( - X, y, - discrete_features=discrete_features, + X, y, + discrete_features=discrete_features, discrete_target=discrete_target, n_neighbors=n_neighbors, copy=copy, random_state=random_state, n_jobs=n_jobs, conditional=conditional) - - if conditional: + + if conditional: + # method from [2]_. np.multiply(mi, -1, out=mi) else: + # method from [1]_. 
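            # (Sketch of the intent, assuming the mRMR reading of [1]_: the diagonal
            # keeps -I(x_i; y) as linear relevance biases, while the off-diagonal
            # entries are scaled by alpha as pairwise redundancy penalties.)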
# mutpliypling off-diagonal ones with -1 diagonal = -np.diag(mi).copy() # mutpliypling off-diagonal ones with alpha - np.multiply(mi, alpha, out=mi) + np.multiply(mi, alpha, out=mi) np.fill_diagonal(mi, diagonal) - - self.mi_matrix = mi - + it = np.nditer(mi, flags=['multi_index'], op_flags=[['readonly']]) cqm.set_objective((*it.multi_index, x) for x in it if x) - return cqm + return cqm def fit( self, @@ -416,7 +431,7 @@ def _check_params(X: npt.ArrayLike, num_features: int): X = np.atleast_2d(np.asarray(X)) y = np.asarray(y) - + if X.ndim != 2: raise ValueError("X must be a 2-dimensional array-like") @@ -424,7 +439,8 @@ def _check_params(X: npt.ArrayLike, raise ValueError("y must be a 1-dimensional array-like") if y.shape[0] != X.shape[0]: - raise ValueError(f"requires: X.shape[0] == y.shape[0] but {X.shape[0]} != {y.shape[0]}") + raise ValueError( + f"requires: X.shape[0] == y.shape[0] but {X.shape[0]} != {y.shape[0]}") if not 0 <= alpha <= 1: raise ValueError(f"alpha must be between 0 and 1, given {alpha}") @@ -436,54 +452,63 @@ def _check_params(X: npt.ArrayLike, raise ValueError("X must have at least two rows") -def estimate_mi_matrix( +def estimate_mi_matrix( X: npt.ArrayLike, y: npt.ArrayLike, - discrete_features: typing.Union[str, npt.ArrayLike]="auto", + discrete_features: typing.Union[str, npt.ArrayLike] = "auto", discrete_target: bool = False, n_neighbors: int = 4, conditional: bool = True, copy: bool = True, random_state: typing.Union[None, int] = None, n_jobs: typing.Union[None, int] = None, - ) -> npt.ArrayLike: +) -> npt.ArrayLike: """ For the feature array `X` and the target array `y` computes - the matrix of (conditional) mutual information interactions. - - cmi_{i, j} = I(x_i; y) - + the matrix of (conditional) mutual information interactions. + The matrix is defined as follows: + + `M_(i, i) = I(x_i; y)` + If `conditional = True`, then the off-diagonal terms are computed: - - cmi_{i, j} = (I(x_i; y| x_j) + I(x_j; y| x_i)) / 2 - + + `M_(i, j) = (I(x_i; y| x_j) + I(x_j; y| x_i)) / 2` + Otherwise - - cmi_{i, j} = I(x_i; x_j) - - Computation of I(x; y) uses the scikit-learn implementation, i.e., - :func:`sklearn.feature_selection._mutual_info._estimate_mi`. The computation - of I(x; y| z) is based on - https://github.com/jannisteunissen/mutual_information + + `M_(i, j) = I(x_i; x_j)` + + Computation of I(x; y) uses modified scikit-learn methods. + The computation of I(x; y| z) is based on + https://github.com/jannisteunissen/mutual_information and [3]_. + + The method can be computationally expensive for a large number of features (> 1000) and + a large number of samples (> 100000). In this case, it can be advisable to downsample the + dataset. + + The estimation methods are linear in the number of outcomes (labels) of discrete distributions. + It may be beneficial to treat the discrete distrubitions with a large number of labels as + continuous distributions. 
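    A minimal usage sketch (the toy data and chosen keyword values below are
    illustrative assumptions; parameter names follow the signature above):

    import numpy as np
    from dwave.plugins.sklearn.transformers import estimate_mi_matrix

    rng = np.random.default_rng(1)
    X = rng.standard_normal((500, 4))
    y = X[:, 0] + 0.1 * rng.standard_normal(500)

    M = estimate_mi_matrix(X, y, conditional=True, n_neighbors=4, random_state=1)
    # M is symmetric: M[i, i] estimates I(x_i; y) and
    # M[i, j] estimates (I(x_i; y | x_j) + I(x_j; y | x_i)) / 2
    assert M.shape == (4, 4) and np.allclose(M, M.T)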
Args: - X: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` - - y: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` - + X: See :func:`sklearn.feature_selection.mutual_info_regression` + + y: See :func:`sklearn.feature_selection.mutual_info_regression` + conditional: bool, default=True Whether to compute the off-diagonal terms using the conditional mutual information or joint mutual information - discrete_features: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` - - discrete_target: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` - - n_neighbors: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` - - copy: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` - - random_state: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + discrete_features: See :func:`sklearn.feature_selection.mutual_info_regression` + + discrete_target: bool, default=False + Whether the target variable `y` is discrete + + n_neighbors: See :func:`sklearn.feature_selection.mutual_info_regression` + + copy: See :func:`sklearn.feature_selection.mutual_info_regression` + + random_state: See :func:`sklearn.feature_selection.mutual_info_regression` n_jobs: int, default=None The number of parallel jobs to run for the conditional mutual information @@ -491,13 +516,13 @@ def estimate_mi_matrix( ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - + Returns: mi_matrix : ndarray, shape (n_features, n_features) Interaction matrix between the features using (conditional) mutual information. A negative value will be replaced by 0. - References: + References: .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual information". Phys. Rev. E 69, 2004. .. [2] B. C. Ross "Mutual Information between Discrete and Continuous @@ -508,7 +533,7 @@ def estimate_mi_matrix( """ X, y = check_X_y(X, y, accept_sparse="csc", y_numeric=not discrete_target) n_samples, n_features = X.shape - + if isinstance(discrete_features, (str, bool)): if isinstance(discrete_features, str): if discrete_features == "auto": @@ -525,28 +550,26 @@ def estimate_mi_matrix( else: discrete_mask = discrete_features - # copying X if needed - if copy: - X = X.copy() - - # Computing the diagonal terms - mutual_info_args = dict(X=X, y=y, - discrete_features=discrete_mask, - n_neighbors=n_neighbors, - copy=False, random_state=random_state) - # In earlier versions of sklearn functions `mutual_info_classif` and - # `mutual_info_regression` do not allow for parallel computations and - # do not have `n_jobs` argument. 
- if "n_jobs" in signature(mutual_info_classif).parameters and \ - "n_jobs" in signature(mutual_info_regression).parameters: - mutual_info_args["n_jobs"] = n_jobs - if discrete_target: - diagonal_vals = mutual_info_classif(**mutual_info_args) - else: - diagonal_vals = mutual_info_regression(**mutual_info_args) - # the target y is not modified even if copy=False - # https://github.com/scikit-learn/scikit-learn/issues/28793 - rng = check_random_state(random_state) + continuous_mask = ~discrete_mask + if np.any(continuous_mask) and issparse(X): + raise ValueError("Sparse matrix `X` can't have continuous features.") + + rng = check_random_state(random_state) + if np.any(continuous_mask): + X = X.astype(np.float64, copy=copy) + X[:, continuous_mask] = scale( + X[:, continuous_mask], with_mean=False, copy=False + ) + + # Add small noise to continuous features as advised in Kraskov et. al. + means = np.maximum(1, np.mean(np.abs(X[:, continuous_mask]), axis=0)) + X[:, continuous_mask] += ( + 1e-10 + * means + * rng.standard_normal(size=(n_samples, np.sum(continuous_mask))) + ) + + if not discrete_target: y = scale(y, with_mean=False) y += ( 1e-10 @@ -555,19 +578,25 @@ def estimate_mi_matrix( ) mi_matrix = np.zeros((n_features, n_features), dtype=np.float64) + # Computing the diagonal terms + diagonal_vals = Parallel(n_jobs=n_jobs)( + delayed(_compute_mi)(x, y, discrete_feature, discrete_target, n_neighbors) + for x, discrete_feature in zip(_iterate_columns(X), discrete_mask) + ) np.fill_diagonal(mi_matrix, diagonal_vals) + # Computing the off-diagonal terms off_diagonal_iter = combinations(zip(_iterate_columns(X), discrete_mask), 2) if conditional: - off_diagonal_vals = Parallel(n_jobs=n_jobs, verbose=1)( - delayed(_compute_off_diagonal_cmi) + off_diagonal_vals = Parallel(n_jobs=n_jobs)( + delayed(_compute_cmi_distance) (xi, xj, y, discrete_feature_i, discrete_feature_j, discrete_target, n_neighbors) for (xi, discrete_feature_i), (xj, discrete_feature_j) in off_diagonal_iter) else: - off_diagonal_vals = Parallel(n_jobs=n_jobs, verbose=1)( - delayed(_compute_off_diagonal_mi) + off_diagonal_vals = Parallel(n_jobs=n_jobs)( + delayed(_compute_mi) (xi, xj, discrete_feature_i, discrete_feature_j, n_neighbors) for (xi, discrete_feature_i), (xj, discrete_feature_j) in off_diagonal_iter) - + off_diagonal_val = iter(off_diagonal_vals) for i, j in combinations(range(n_features), 2): mi_matrix[i, j] = mi_matrix[j, i] = next(off_diagonal_val) diff --git a/dwave/plugins/sklearn/utilities.py b/dwave/plugins/sklearn/utilities.py index dee7565..bbdf956 100644 --- a/dwave/plugins/sklearn/utilities.py +++ b/dwave/plugins/sklearn/utilities.py @@ -54,8 +54,8 @@ from scipy.sparse import issparse from scipy.special import digamma from scipy.stats import entropy +from sklearn.metrics.cluster import mutual_info_score from sklearn.neighbors import NearestNeighbors, KDTree -from sklearn.feature_selection import mutual_info_classif, mutual_info_regression __all__ = ["corrcoef", "cov", "dot_2d"] @@ -231,7 +231,7 @@ def dot_2d(a: npt.ArrayLike, b: npt.ArrayLike, *, return out -def _compute_off_diagonal_cmi( +def _compute_cmi_distance( xi: npt.ArrayLike, xj: npt.ArrayLike, y: npt.ArrayLike, @@ -239,11 +239,11 @@ def _compute_off_diagonal_cmi( discrete_feature_j: bool, discrete_target: bool, n_neighbors: int = 4, - ): +): """ Computing a distance `d` based on the conditional mutual infomation between features `x_i` and `x_j`: `d = (I(x_i; y | x_j)+I(x_j; y | x_i))/2`. - + References ---------- .. [1] A. Kraskov, H. 
Stogbauer and P. Grassberger, "Estimating mutual @@ -255,99 +255,68 @@ def _compute_off_diagonal_cmi( data." IEEE Transactions on Information Theory 67.1 (2020): 464-484. """ if discrete_feature_i and discrete_feature_j and discrete_target: - val = _compute_cmip_d(xi, xj, y) - elif discrete_target: - val = _compute_cmip_ccd(xi, xj, y, n_neighbors) + ans = _compute_cmip_d(xi, xj, y) + elif discrete_target: + ans = _compute_cmip_ccd(xi, xj, y, n_neighbors) elif discrete_feature_i and (not discrete_target): - val = _compute_cmip_cdc(xj, xi, y, n_neighbors) + ans = _compute_cmip_cdc(xj, xi, y, n_neighbors) elif (not discrete_feature_i) and discrete_feature_j and (not discrete_target): - val = _compute_cmip_cdc(xi, xj, y, n_neighbors) + ans = _compute_cmip_cdc(xi, xj, y, n_neighbors) else: - val = _compute_cmip_c(xi, xj, y, n_neighbors) - return sum(val) / 2 - - -def _compute_off_diagonal_mi( - xi: npt.ArrayLike, - xj: npt.ArrayLike, - discrete_feature_i: bool, - discrete_feature_j: bool, - n_neighbors: int = 4, - ): - """ - Compute mutual information between features `I(x_i; x_j)`. - - References - ---------- - .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual - information". Phys. Rev. E 69, 2004. - .. [2] B. C. Ross "Mutual Information between Discrete and Continuous - Data Sets". PLoS ONE 9(2), 2014. - """ - if discrete_feature_j: - return mutual_info_classif( - xi.reshape(-1, 1), xj, discrete_features=[discrete_feature_i], n_neighbors=n_neighbors) - elif discrete_feature_i: - return mutual_info_classif( - xj.reshape(-1, 1), xi, discrete_features=[discrete_feature_j], n_neighbors=n_neighbors) - else: - return mutual_info_regression( - xi.reshape(-1, 1), xj, discrete_features=[discrete_feature_i], n_neighbors=n_neighbors) + ans = _compute_cmip_c(xi, xj, y, n_neighbors) + return np.mean(ans) def _compute_cmip_d( - xi: npt.ArrayLike, - xj: npt.ArrayLike, - y: npt.ArrayLike) -> typing.Tuple[float]: + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike) -> typing.Tuple[float]: """ - Computes conditional mutual infomation pair `I(x_i; y | x_j)` `I(x_j; y | x_i)`. + Computes conditional mutual infomation pair `I(x_i; y | x_j)` and `I(x_j; y | x_i)`. All random variables `x_i`, `x_j` and `y` are discrete. 
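    For reference, the histogram-based estimate below follows the standard identity
    `I(x_i; y | x_j) = H(x_i, x_j) + H(x_j, y) - H(x_j) - H(x_i, x_j, y)`, with the
    symmetric expression for `I(x_j; y | x_i)` obtained by exchanging the roles of
    `x_i` and `x_j`.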
- + Adpated from https://github.com/dwave-examples/mutual-information-feature-selection """ - # Computing joint probability distribution for the features and the target bin_boundaries = [np.hstack((np.unique(col), np.inf)) for col in (xi, xj, y)] prob, _ = np.histogramdd(np.vstack((xi, xj, y)).T, bins=bin_boundaries) - Hijy = entropy(prob.flatten()) - Hij = entropy(prob.sum(axis=2).flatten()) + + # Computing entropy + Hijy = entropy(prob.flatten()) # H(x_i, x_j, y) + Hij = entropy(prob.sum(axis=2).flatten()) # H(x_i, x_j) cmi_ij = ( - Hij # H(x_i, x_j) - +entropy(prob.sum(axis=0).flatten()) # H(x_j, y) - -entropy(prob.sum(axis=(0,2))) # H(x_j) - -Hijy # H(x_i, x_j, y) + Hij-Hijy + + entropy(prob.sum(axis=0).flatten()) # H(x_j, y) + - entropy(prob.sum(axis=(0, 2))) # H(x_j) ) cmi_ji = ( - Hij # H(x_i, x_j) - +entropy(prob.sum(axis=1).flatten()) # H(x_i, y) - -entropy(prob.sum(axis=(1,2))) # H(x_i) - -Hijy # H(x_i, x_j, y) + Hij-Hijy + + entropy(prob.sum(axis=1).flatten()) # H(x_i, y) + - entropy(prob.sum(axis=(1, 2))) # H(x_i) ) return cmi_ij, cmi_ji def _compute_cmip_c( - xi: npt.ArrayLike, - xj: npt.ArrayLike, - y: npt.ArrayLike, - n_neighbors: int = 4) -> typing.Tuple[float]: + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike, + n_neighbors: int = 4) -> typing.Tuple[float]: """ - Computes conditional mutual information pair `(I(x_i; y | x_j)` - and `(I(x_j; y | x_i)`. All random variables `x_i`, `x_j` and `y` + Computes conditional mutual information pair `I(x_i; y | x_j)` + and `I(x_j; y | x_i)`. All random variables `x_i`, `x_j` and `y` are assumed to be continuous. - + Adapted from https://github.com/jannisteunissen/mutual_information """ - xi = xi.reshape((-1, 1)) + xi = xi.reshape((-1, 1)) xj = xj.reshape((-1, 1)) y = y.reshape((-1, 1)) - + # Here we rely on NearestNeighbors to select the fastest algorithm. - nn = NearestNeighbors(metric="chebyshev", n_neighbors=n_neighbors) - nn.fit(np.hstack((xi, xj, y))) - radius = nn.kneighbors()[0] - radius = np.nextafter(radius[:, -1], 0) - + radius = _get_radius_k_neighbours(np.hstack((xi, xj, y)), + n_neighbors=n_neighbors) + # KDTree is explicitly fit to allow for the querying of number of # neighbors within a specified radius n_j = _num_points_within_radius(xj, radius) @@ -357,41 +326,28 @@ def _compute_cmip_c( n_i = _num_points_within_radius(xi, radius) n_iy = _num_points_within_radius(np.hstack((y, xi)), radius) - cmi_ij = ( - digamma(n_neighbors) - +np.mean(digamma(n_j)) - -np.mean(digamma(n_ij)) - -np.mean(digamma(n_jy))) - - cmi_ji = ( - digamma(n_neighbors) - +np.mean(digamma(n_i)) - -np.mean(digamma(n_ij)) - -np.mean(digamma(n_iy))) - return max(0, cmi_ij), max(0, cmi_ji) + return _get_cmi_pair_from_estimates([n_neighbors], n_ij, n_j, n_jy, n_i, n_iy) def _compute_cmip_ccd( - xi: npt.ArrayLike, - xj: npt.ArrayLike, - y: npt.ArrayLike, - n_neighbors: int = 4) -> typing.Tuple[float]: + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike, + n_neighbors: int = 4) -> typing.Tuple[float]: """ - Computes conditional mutual information pair `(I(x_i; y | x_j)` - and `(I(x_j; y | x_i)`. Random variables `x_i`, `x_j` are assumed + Computes conditional mutual information pair `I(x_i; y | x_j)` + and `I(x_j; y | x_i)`. Random variables `x_i`, `x_j` are assumed to be continuous, while `y` is assumed to be discrete. 
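    For orientation, the k-nearest-neighbor estimators in this family follow the
    Frenzel-Pompe form (offset conventions as in the referenced implementations,
    stated here as a hedged summary rather than taken from the patch):
    `I(x_i; y | x_j) ~ mean(psi(k)) + mean(psi(n_{x_j})) - mean(psi(n_{x_i, x_j})) - mean(psi(n_{x_j, y}))`,
    where `psi` is the digamma function, `k` is the per-point neighbor count and the
    `n_*` terms count neighbors within the resulting Chebyshev radius; negative
    estimates are clipped to zero.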
- - Adapted from https://github.com/jannisteunissen/mutual_information and - :func:`sklearn.feature_selection._mutual_info._compute_mi_cd` + + Adapted from https://github.com/jannisteunissen/mutual_information and + :func:`sklearn.feature_selection.mutual_info_regression` """ - xi = xi.reshape((-1, 1)) + xi = xi.reshape((-1, 1)) xj = xj.reshape((-1, 1)) - (radius, - label_counts, - k_all) = _get_radius_k_neighbours_d( - xi, xj, y, - n_neighbors=n_neighbors) + # Here we rely on NearestNeighbors to select the fastest algorithm. + radius, label_counts, k_all = _get_radius_k_neighbours_d(np.hstack((xi, xj)), y, + n_neighbors=n_neighbors) # Ignore points with unique labels. mask = label_counts > 1 @@ -403,51 +359,36 @@ def _compute_cmip_ccd( radius = radius[mask] # KDTree is explicitly fit to allow for the querying of number of # neighbors within a specified radius - - # continuous + # continuous n_i = _num_points_within_radius(xi, radius) n_j = _num_points_within_radius(xj, radius) n_ij = _num_points_within_radius(np.hstack((xi, xj)), radius) - - # mixed estimates + # mixed discrete/continuous estimates n_iy = _num_points_within_radius_cd(xi, y, radius) n_jy = _num_points_within_radius_cd(xj, y, radius) - - common_terms = ( - np.mean(digamma(k_all)) - -np.mean(digamma(n_ij))) - cmi_ij = ( - common_terms - +np.mean(digamma(n_j)) - -np.mean(digamma(n_jy))) - cmi_ji = ( - common_terms - +np.mean(digamma(n_i)) - -np.mean(digamma(n_iy))) - return max(0, cmi_ij), max(0, cmi_ji) + + return _get_cmi_pair_from_estimates(k_all, n_ij, n_j, n_jy, n_i, n_iy) def _compute_cmip_cdc( - xi: npt.ArrayLike, - xj: npt.ArrayLike, - y: npt.ArrayLike, - n_neighbors: int = 4) -> typing.Tuple[float]: + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike, + n_neighbors: int = 4) -> typing.Tuple[float]: """ - Computes conditional mutual information pair `(I(x_i; y | x_j)` - and `(I(x_j; y | x_i)`. Random variables `x_i`, `y` are assumed + Computes conditional mutual information pair `I(x_i; y | x_j)` + and `I(x_j; y | x_i)`. Random variables `x_i`, `y` are assumed to be continuous, while `x_j` is assumed to be discrete. - + Adapted from https://github.com/jannisteunissen/mutual_information and - :func:`sklearn.feature_selection._mutual_info._compute_mi_cd` - """ + :func:`sklearn.feature_selection.mutual_info_regression` + """ xi = xi.reshape((-1, 1)) y = y.reshape((-1, 1)) - (radius, - label_counts, - k_all) = _get_radius_k_neighbours_d( - xi, y, xj, - n_neighbors=n_neighbors) + # Here we rely on NearestNeighbors to select the fastest algorithm. + radius, label_counts, k_all = _get_radius_k_neighbours_d(np.hstack((xi, y)), xj, + n_neighbors=n_neighbors) # Ignore points with unique labels. 
mask = label_counts > 1 @@ -457,71 +398,163 @@ def _compute_cmip_cdc( xj = xj[mask] y = y[mask] radius = radius[mask] - + # KDTree is explicitly fit to allow for the querying of number of # neighbors within a specified radius - - # continuous + # continuous n_i = _num_points_within_radius(xi, radius) n_iy = _num_points_within_radius(np.hstack((xi, y)), radius) - - # mixed estimates + # mixed coninuous/discrete estimates n_ij = _num_points_within_radius_cd(xi, xj, radius) n_jy = _num_points_within_radius_cd(y, xj, radius) - # discrete estimates n_j = label_counts - - common_terms = ( - np.mean(digamma(k_all)) - -np.mean(digamma(n_ij))) - cmi_ij = ( - common_terms - +np.mean(digamma(n_j)) - -np.mean(digamma(n_jy))) - cmi_ji = ( - common_terms - +np.mean(digamma(n_i)) - -np.mean(digamma(n_iy))) + + return _get_cmi_pair_from_estimates(k_all, n_ij, n_j, n_jy, n_i, n_iy) + + +def _get_cmi_pair_from_estimates( + k_all: npt.ArrayLike, n_ij: npt.ArrayLike, n_j: npt.ArrayLike, + n_jy: npt.ArrayLike, n_i: npt.ArrayLike, n_iy: npt.ArrayLike) -> typing.Tuple[float]: + + common_terms = np.mean(digamma(k_all))-np.mean(digamma(n_ij)) + cmi_ij = common_terms+np.mean(digamma(n_j))-np.mean(digamma(n_jy)) + cmi_ji = common_terms+np.mean(digamma(n_i))-np.mean(digamma(n_iy)) return max(0, cmi_ij), max(0, cmi_ji) +def _compute_mi( + x: npt.ArrayLike, + y: npt.ArrayLike, + discrete_feature_x: bool, + discrete_feature_y: bool, + n_neighbors: int = 4, +): + """ + Compute mutual information between features `I(x_i; x_j)`. + + References + ---------- + .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual + information". Phys. Rev. E 69, 2004. + .. [2] B. C. Ross "Mutual Information between Discrete and Continuous + Data Sets". PLoS ONE 9(2), 2014. + """ + if discrete_feature_y and discrete_feature_x: + return mutual_info_score(x, y) + elif discrete_feature_x: + return _compute_mi_cd(y, x, n_neighbors=n_neighbors) + elif discrete_feature_y: + return _compute_mi_cd(x, y, n_neighbors=n_neighbors) + else: + return _compute_mi_cc(x, y, n_neighbors=n_neighbors) + + +def _compute_mi_cc(x: npt.ArrayLike, y: npt.ArrayLike, n_neighbors: int) -> float: + """Computes mutual information `I(x; y)`. Random variables `x`, `y` are assumed + to be continuous. + + Adapted from :func:`sklearn.feature_selection.mutual_info_regression` + """ + n_samples = x.size + + x = x.reshape((-1, 1)) + y = y.reshape((-1, 1)) + + # Here we rely on NearestNeighbors to select the fastest algorithm. + radius = _get_radius_k_neighbours(np.hstack((x, y)), n_neighbors) + + # KDTree is explicitly fit to allow for the querying of number of + # neighbors within a specified radius + nx = _num_points_within_radius(x, radius) + ny = _num_points_within_radius(y, radius) + + mi = ( + digamma(n_samples) + + digamma(n_neighbors) + - np.mean(digamma(nx)) + - np.mean(digamma(ny)) + ) + + return max(0, mi) + + +def _compute_mi_cd(x: npt.ArrayLike, y: npt.ArrayLike, n_neighbors: int) -> float: + """Computes mutual information `I(x; y)`. Random variable `x`, is assumed + to be continuous, while `y` is assumed to be discrete. + + Adapted from :func:`sklearn.feature_selection.mutual_info_regression` + """ + radius, label_counts, k_all = _get_radius_k_neighbours_d(x, y, n_neighbors=n_neighbors) + + # Ignore points with unique labels. 
+ mask = label_counts > 1 + n_samples = np.sum(mask) + label_counts = label_counts[mask] + k_all = k_all[mask] + x = x[mask] + radius = radius[mask] + + m_all = _num_points_within_radius(x.reshape(-1, 1), radius) + + mi = ( + digamma(n_samples) + + np.mean(digamma(k_all)) + - np.mean(digamma(label_counts)) + - np.mean(digamma(m_all)) + ) + + return max(0, mi) + + +def _get_radius_k_neighbours(X: npt.ArrayLike, n_neighbors: int = 4) -> npt.ArrayLike: + """ + Determine smallest radius around `X` containing `n_neighbors` neighbors + Inspired by :func:`sklearn.feature_selection.mutual_info_regression` + """ + nn = NearestNeighbors(metric="chebyshev", n_neighbors=n_neighbors) + return _get_radius_from_nn(X, nn) + + def _get_radius_k_neighbours_d( - c1: npt.ArrayLike, - c2: npt.ArrayLike, + c: npt.ArrayLike, d: npt.ArrayLike, n_neighbors: int = 4 - ) -> typing.Tuple[npt.ArrayLike, npt.ArrayLike, npt.ArrayLike]: +) -> typing.Tuple[npt.ArrayLike, npt.ArrayLike, npt.ArrayLike]: """ - Determine smallest radius around x containing n_neighbors neighbors - Inspired by :func:`sklearn.feature_selection._mutual_info._compute_mi_cd` + Determine smallest radius around `c` and `d` containing `n_neighbors` neighbors + Inspired by :func:`sklearn.feature_selection.mutual_info_regression` """ - c1 = c1.reshape((-1, 1)) - c2 = c2.reshape((-1, 1)) - n_samples = c1.shape[0] + d = d.flatten() + n_samples = d.shape[0] + c = c.reshape((n_samples, -1)) + radius = np.empty(n_samples) label_counts = np.empty(n_samples) k_all = np.empty(n_samples) nn = NearestNeighbors(metric="chebyshev") - for label in np.unique(d): - mask = d == label + mask = d == label count = np.sum(mask) if count > 1: k = min(n_neighbors, count - 1) nn.set_params(n_neighbors=k) - nn.fit(np.hstack((c1[mask], c2[mask]))) - r = nn.kneighbors()[0] - radius[mask] = np.nextafter(r[:, -1], 0) + radius[mask] = _get_radius_from_nn(c[mask], nn) k_all[mask] = k - label_counts[mask] = np.sum(mask) + label_counts[mask] = np.sum(mask) return radius, label_counts, k_all -def _num_points_within_radius(x: npt.ArrayLike, radius: npt.ArrayLike): +def _get_radius_from_nn(X: npt.ArrayLike, nn: NearestNeighbors) -> npt.ArrayLike: + nn.fit(X) + r = nn.kneighbors()[0] + return np.nextafter(r[:, -1], 0) + + +def _num_points_within_radius(x: npt.ArrayLike, radius: npt.ArrayLike) -> npt.ArrayLike: """For each point, determine the number of other points within a given radius Function from https://github.com/jannisteunissen/mutual_information - + :param x: ndarray, shape (n_samples, n_dim) :param radius: radius, shape (n_samples,) :returns: number of points within radius @@ -533,28 +566,25 @@ def _num_points_within_radius(x: npt.ArrayLike, radius: npt.ArrayLike): def _num_points_within_radius_cd( - c: npt.ArrayLike, - d: npt.ArrayLike, - radius: npt.ArrayLike) -> npt.ArrayLike: + c: npt.ArrayLike, + d: npt.ArrayLike, + radius: npt.ArrayLike) -> npt.ArrayLike: """ For each point, determine the number of other points within a given radius - Inspired by :func:`sklearn.feature_selection._mutual_info._compute_mi_cd` + Inspired by :func:`sklearn.feature_selection.mutual_info_regression` """ c = c.reshape((-1, 1)) n_samples = c.shape[0] m_all = np.empty(n_samples) for label in np.unique(d): mask = d == label - kd = KDTree(c[mask], metric="chebyshev") - m_all[mask] = kd.query_radius( - c[mask], radius[mask], - count_only=True, return_distance=False) + m_all[mask] = _num_points_within_radius(c[mask], radius[mask]) return m_all def _iterate_columns(X, columns=None): """Iterate over 
columns of a matrix. - Copied from :func:`sklearn.feature_selection._mutual_info._iterate_columns` + Copied from :func:`sklearn.feature_selection._mutual_info` Parameters ---------- @@ -581,4 +611,3 @@ def _iterate_columns(X, columns=None): else: for i in columns: yield X[:, i] - \ No newline at end of file diff --git a/releasenotes/notes/mutual-info-ddd1a23e320f88ae.yaml b/releasenotes/notes/mutual-info-ddd1a23e320f88ae.yaml index c3ed08c..b809dc2 100644 --- a/releasenotes/notes/mutual-info-ddd1a23e320f88ae.yaml +++ b/releasenotes/notes/mutual-info-ddd1a23e320f88ae.yaml @@ -2,3 +2,8 @@ features: - | Add feature selection method using (conditional) mutual information +deprecations: + - | + Drop support for Python 3.7. + - | + Drop support for scikit-learn 1.2.0 diff --git a/requirements.txt b/requirements.txt index ee3a4a4..c6e0887 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,6 @@ numpy==1.22.0; python_version~="3.10.0" # needed for scipy on windows oldest-supported-numpy; python_version>="3.11.0" scikit-learn==1.2.1 -scipy>=1.7.3 +scipy==1.9.0 reno==3.5.0 diff --git a/setup.cfg b/setup.cfg index b03a6f0..b036d93 100644 --- a/setup.cfg +++ b/setup.cfg @@ -24,8 +24,8 @@ install_requires = dwave-system>=1.18.0 numpy>=1.20.0 scikit-learn>=1.2.1 - scipy>=1.7.3 -packages = + scipy>=1.9.0 +packages = dwave dwave.plugins dwave.plugins.sklearn diff --git a/tests/test_transformer.py b/tests/test_transformer.py index abfa25f..cfe4c49 100644 --- a/tests/test_transformer.py +++ b/tests/test_transformer.py @@ -122,14 +122,14 @@ def test_pipeline(self): clf.predict(X) - def test_alpha_0(self): + def test_alpha_0(self): cqm = SelectFromQuadraticModel().correlation_cqm(X=self.X, y=self.y, num_features=3, alpha=0) self.assertTrue(not any(cqm.objective.linear.values())) def test_alpha_1_no_quadratic(self): cqm = SelectFromQuadraticModel().correlation_cqm(X=self.X, y=self.y, num_features=3, alpha=1) self.assertTrue(not any(cqm.objective.quadratic.values())) - + def test_alpha_1(self): rng = np.random.default_rng(42) diff --git a/tests/test_utilities.py b/tests/test_utilities.py index 0ab57c7..364a64b 100644 --- a/tests/test_utilities.py +++ b/tests/test_utilities.py @@ -45,31 +45,22 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import os.path import tempfile -import typing import unittest import numpy as np -import numpy.typing as npt from dwave.plugins.sklearn.utilities import ( corrcoef, cov, dot_2d, - _compute_off_diagonal_cmi, - _compute_off_diagonal_mi, + _compute_mi, _compute_cmip_c, _compute_cmip_d, _compute_cmip_cdc, _compute_cmip_ccd ) -from dwave.plugins.sklearn.transformers import estimate_mi_matrix -from scipy.linalg import norm -from scipy.sparse import issparse from scipy.special import digamma -from sklearn.feature_selection._mutual_info import _compute_mi from sklearn.feature_selection import mutual_info_classif, mutual_info_regression from sklearn.neighbors import KDTree from sklearn.preprocessing import scale -from sklearn.utils import check_random_state -from sklearn.utils.validation import check_array, check_X_y + class TestCorrCoef(unittest.TestCase): def test_agreement(self): @@ -198,8 +189,8 @@ def test_memmap(self): class TestMI(unittest.TestCase): - def test_cmi_cdc(self): - np.random.seed(42) + def test_cmi_mixed(self): + np.random.seed(42) n_samples, n_neighbors = 4003, 4 c1 = np.random.randn(n_samples,) n_ss = 1001 @@ -219,7 +210,7 @@ def test_cmi_cdc(self): msg="Computation for mixed continuous/discrete conditional mutual information is not accurate") def test_cmi_symmetry(self): - np.random.seed(42) + np.random.seed(42) n_samples, n_neighbors = 4003, 4 # Test continuous implementation Xy = np.random.randn(n_samples, 3) @@ -240,9 +231,9 @@ def test_cmi_symmetry(self): cmi_ji_pair = _compute_cmip_ccd(c2, c1, d, n_neighbors=n_neighbors) self.assertAlmostEqual( sum(cmi_ij_pair), sum(cmi_ji_pair), places=3, - msg="Computation for mixed continuous/discrete conditional mutual information is not symmetric") + msg="Computation for mixed continuous/discrete conditional mutual information is not symmetric") - def test_cmi(self): + def test_cmi_discrete(self): """ We test the algorithm using the formula `I(x;y|z) = I(x;y) + I(z;y) - I(z;y|x)`. Since the mutual information data is an sklearn function, @@ -250,7 +241,7 @@ def test_cmi(self): it is highly likely that formula is valid only if our algorithm is correct. 
""" np.random.seed(42) - # Test discrete implementation + # Test discrete implementation n_samples, n_ss, n_neighbors = 103, 51, 4 xi = np.random.randint(0, 11, (n_samples, )) xj = np.hstack(( @@ -273,61 +264,41 @@ def test_cmi(self): msg="Discrete conditional mutual information computation is not symmetric") def test_mi(self): - np.random.seed(42) - # Test discrete implementation - n_samples, n_ss, n_neighbors = 103, 51, 4 + from sklearn.feature_selection._mutual_info import _compute_mi_cd, _compute_mi_cc + from sklearn.metrics.cluster import mutual_info_score + # Test discrete implementation + n_samples, n_ss, n_neighbors, random_state = 103, 51, 4, 15 + np.random.seed(random_state) xi = np.random.randint(0, 11, (n_samples, )) xj = np.hstack(( np.random.randint(-2, 1, (n_ss, )), np.random.randint(10, 14, (n_samples-n_ss, )))) np.random.shuffle(xi) np.random.shuffle(xj) - mi_ij = _compute_off_diagonal_mi( - xi.reshape(-1, 1), xj, True, True, n_neighbors=n_neighbors) - mi_ji = _compute_off_diagonal_mi( - xi.reshape(-1, 1), xj, True, True, n_neighbors=n_neighbors) - mi_ij_cl = mutual_info_classif(xi.reshape(-1, 1), xj, discrete_features=True) + xi_c = scale(np.random.randn(n_samples), with_mean=False) + xj_c = scale(np.random.randn(n_samples), with_mean=False) + + mi_ij = _compute_mi( + xi, xj, True, True, n_neighbors=n_neighbors) + mi_ij_cl = mutual_info_score(xi, xj) self.assertTrue( np.allclose(mi_ij_cl, mi_ij), msg="Discrete mutual information is not computed correctly") - self.assertTrue( - np.allclose(mi_ij_cl, mi_ji), - msg="Discrete mutual information is not computed correctly") - mi_ij = _compute_off_diagonal_mi( - xi, xj, False, True, n_neighbors=n_neighbors) - mi_ji = _compute_off_diagonal_mi( - xi, xj, False, True, n_neighbors=n_neighbors) - mi_ij_cl = mutual_info_classif( - xi.reshape(-1, 1), xj, discrete_features=False) - self.assertTrue( - np.allclose(mi_ij_cl, mi_ij), - msg="Mixed mutual information is not computed correctly") - self.assertTrue( - np.allclose(mi_ij_cl, mi_ji), - msg="Mixed mutual information is not computed correctly") - - xj = np.random.randn(n_samples) - mi_ij = _compute_off_diagonal_mi( - xi, xj, True, False, n_neighbors=n_neighbors) - mi_ij_cl = mutual_info_regression( - xi.reshape(-1, 1), xj, - discrete_features=True, n_neighbors=n_neighbors) + mi_ij = _compute_mi( + xi_c, xj, False, True, n_neighbors=n_neighbors) + mi_ij_cl = _compute_mi_cd(xi_c, xj, n_neighbors=n_neighbors) self.assertTrue( np.allclose(mi_ij_cl, mi_ij, atol=1e-5), - msg="Continuous mutual information is not computed correctly") - - xi = np.random.randn(n_samples) - mi_ij = _compute_off_diagonal_mi( - xi, xj, False, False, n_neighbors=n_neighbors) - mi_ij_cl = mutual_info_regression( - xi.reshape(-1, 1), xj, - discrete_features=False, n_neighbors=n_neighbors) + msg=f"Error in continuous features/discrete target MI estimation is larger than expected {abs(mi_ij_cl-mi_ij)}") + + mi_ij = _compute_mi(xi_c, xj_c, False, False, n_neighbors=n_neighbors) + mi_ij_cl = _compute_mi_cc(xi_c, xj_c, n_neighbors=n_neighbors) self.assertTrue( - np.allclose(mi_ij_cl, mi_ij, atol=1e-3), - msg="Continuous mutual information is not computed correctly") - - def test_implementation(self): + np.allclose(mi_ij_cl, mi_ij, atol=1e-5), + msg=f"Error in purely continuous MI is larger than expected {abs(mi_ij_cl-mi_ij)}") + + def test_cmi_continuous(self): # methods from https://github.com/jannisteunissen/mutual_information def _compute_cmi_t(x, z, y, n_neighbors): n_samples = len(x) @@ -338,20 +309,20 @@ def 
_compute_cmi_t(x, z, y, n_neighbors): xyz = np.hstack((x, y, z)) k = np.full(n_samples, n_neighbors) radius = get_radius_kneighbors(xyz, n_neighbors) - + mask = (radius == 0) if mask.sum() > 0: vals, ix, counts = np.unique(xyz[mask], axis=0, - return_inverse=True, - return_counts=True) + return_inverse=True, + return_counts=True) k[mask] = counts[ix] - 1 - + nxz = num_points_within_radius(np.hstack((x, z)), radius) nyz = num_points_within_radius(np.hstack((y, z)), radius) nz = num_points_within_radius(z, radius) cmi = max(0, np.mean(digamma(k)) - np.mean(digamma(nxz + 1)) - - np.mean(digamma(nyz + 1)) + np.mean(digamma(nz + 1))) + - np.mean(digamma(nyz + 1)) + np.mean(digamma(nz + 1))) return cmi def get_radius_kneighbors(x, n_neighbors): @@ -381,15 +352,15 @@ def num_points_within_radius(x, radius): kd = KDTree(x, metric="chebyshev") nx = kd.query_radius(x, radius, count_only=True, return_distance=False) return np.array(nx) - 1.0 - + np.random.seed(42) n_samples, n_neighbors = 103, 4 # Test continuous implementation Xy = np.random.randn(n_samples, 3) - cmi_t_01 = _compute_cmi_t(Xy[:,0], Xy[:,1], Xy[:,2], n_neighbors=n_neighbors) - cmi_t_10 = _compute_cmi_t(Xy[:,1], Xy[:,0], Xy[:,2], n_neighbors=n_neighbors) + cmi_t_01 = _compute_cmi_t(Xy[:, 0], Xy[:, 1], Xy[:, 2], n_neighbors=n_neighbors) + cmi_t_10 = _compute_cmi_t(Xy[:, 1], Xy[:, 0], Xy[:, 2], n_neighbors=n_neighbors) - cmi_ij, cmi_ji = _compute_cmip_c(Xy[:,0], Xy[:,1], Xy[:,2], n_neighbors=n_neighbors) + cmi_ij, cmi_ji = _compute_cmip_c(Xy[:, 0], Xy[:, 1], Xy[:, 2], n_neighbors=n_neighbors) self.assertAlmostEqual( max(cmi_t_01, 0), cmi_ij, places=5, From db27e38a43c43d89adba341236bc5d1ec26240e5 Mon Sep 17 00:00:00 2001 From: Aivar Sootla Date: Thu, 18 Apr 2024 11:50:14 +0100 Subject: [PATCH 07/12] fix scipy version --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c6e0887..4b995a4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,7 @@ numpy==1.22.0; python_version~="3.10.0" # needed for scipy on windows oldest-supported-numpy; python_version>="3.11.0" scikit-learn==1.2.1 -scipy==1.9.0 +scipy==1.9.0; python_version~="3.11.0" +scipy==1.9.2; python_version=="3.11.0" reno==3.5.0 From 443b070171c3334d64a568dc3702a46dd307c36d Mon Sep 17 00:00:00 2001 From: Aivar Sootla Date: Thu, 18 Apr 2024 11:58:40 +0100 Subject: [PATCH 08/12] fix requirements bug --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 4b995a4..cc26dd2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ numpy==1.22.0; python_version~="3.10.0" # needed for scipy on windows oldest-supported-numpy; python_version>="3.11.0" scikit-learn==1.2.1 -scipy==1.9.0; python_version~="3.11.0" +scipy==1.9.0; python_version<"3.11.0" scipy==1.9.2; python_version=="3.11.0" reno==3.5.0 From b2c3b077fc65e4402a2da065dd3a35ccc2cd74f3 Mon Sep 17 00:00:00 2001 From: Aivar Sootla Date: Fri, 19 Apr 2024 14:56:55 +0000 Subject: [PATCH 09/12] refactor, add new tests, add licence headers --- .circleci/config.yml | 3 - .../sklearn/_conditional_mutual_info.py | 764 ++++++++++++++++++ dwave/plugins/sklearn/transformers.py | 165 +--- dwave/plugins/sklearn/utilities.py | 387 --------- tests/test_mutual_info.py | 369 +++++++++ tests/test_utilities.py | 194 +---- 6 files changed, 1137 insertions(+), 745 deletions(-) create mode 100644 dwave/plugins/sklearn/_conditional_mutual_info.py create mode 100644 
tests/test_mutual_info.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 84a4ef1..845014e 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -208,9 +208,6 @@ workflows: python-version: &python-versions ["3.8.6", "3.9.0", "3.10.0", "3.11.0"] pip-constraints: - "scikit-learn==1.2.1" # lowest supported by package - # - "scikit-learn~=1.0.0" # 1.0.2 is the highest in ~=1.0.0 - # - "scikit-learn~=1.1.0" - # - "scikit-learn~=1.2.0" # latest in current minor as of March 2023 - "scikit-learn~=1.0" # latest in current major - test-macos: diff --git a/dwave/plugins/sklearn/_conditional_mutual_info.py b/dwave/plugins/sklearn/_conditional_mutual_info.py new file mode 100644 index 0000000..9e5a546 --- /dev/null +++ b/dwave/plugins/sklearn/_conditional_mutual_info.py @@ -0,0 +1,764 @@ +# The following traversal code is adapted from two sources +# I. Scikit-learn implementation for mutual information computation +# II. Computation of conditional mutual information in the repo +# https://github.com/jannisteunissen/mutual_information +# +# I. Methods and algorithms from `sklearn.feature_selection._mutual_info.py` +# +# Author: Nikolay Mayorov +# License: 3-clause BSD +# +# II. Modifications in https://github.com/jannisteunissen/mutual_information +# BSD 3-Clause License +# +# Copyright (c) 2022, Jannis Teunissen +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Modifications are distributed under the Apache 2.0 Software license. +# +# Copyright 2023 D-Wave Systems Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
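+
+# This module estimates a symmetric feature-interaction matrix from
+# (conditional) mutual information.  ``estimate_mi_matrix`` is the main entry
+# point; the remaining helpers implement nearest-neighbor (KSG-style)
+# estimators of mutual information for continuous, discrete, and mixed
+# variables, following the references cited in the docstrings below.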
+
+import typing
+from itertools import combinations
+
+import numpy as np
+import numpy.typing as npt
+
+from scipy.sparse import issparse
+from scipy.special import digamma
+from scipy.stats import entropy
+from sklearn.metrics.cluster import mutual_info_score
+from sklearn.neighbors import NearestNeighbors, KDTree
+from sklearn.preprocessing import scale
+from sklearn.utils import check_random_state
+from sklearn.utils.parallel import Parallel, delayed
+from sklearn.utils.validation import check_array, check_X_y
+
+
+def estimate_mi_matrix(
+    X: npt.ArrayLike,
+    y: npt.ArrayLike,
+    discrete_features: typing.Union[str, npt.ArrayLike] = "auto",
+    discrete_target: bool = False,
+    n_neighbors: int = 4,
+    conditional: bool = True,
+    copy: bool = True,
+    random_state: typing.Union[None, int] = None,
+    n_jobs: typing.Union[None, int] = None,
+) -> npt.ArrayLike:
+    """
+    For the feature array `X` and the target array `y` computes
+    the matrix of (conditional) mutual information interactions.
+    The matrix is defined as follows:
+
+    `M_(i, i) = I(x_i; y)`
+
+    If `conditional = True`, then the off-diagonal terms are computed:
+
+    `M_(i, j) = (I(x_i; y| x_j) + I(x_j; y| x_i)) / 2`
+
+    Otherwise
+
+    `M_(i, j) = I(x_i; x_j)`
+
+    Computation of I(x; y) uses modified scikit-learn methods.
+    The computation of I(x; y| z) is based on
+    https://github.com/jannisteunissen/mutual_information and [3]_.
+
+    The method can be computationally expensive for a large number of features (> 1000) and
+    a large number of samples (> 100000). In this case, it can be advisable to downsample the
+    dataset.
+
+    The estimation methods are linear in the number of outcomes (labels) of discrete distributions.
+    It may be beneficial to treat the discrete distributions with a large number of labels as
+    continuous distributions.
+
+    Args:
+        X: See :func:`sklearn.feature_selection.mutual_info_regression`
+
+        y: See :func:`sklearn.feature_selection.mutual_info_regression`
+
+        conditional: bool, default=True
+            Whether to compute the off-diagonal terms using the conditional mutual
+            information or joint mutual information
+
+        discrete_features: See :func:`sklearn.feature_selection.mutual_info_regression`
+
+        discrete_target: bool, default=False
+            Whether the target variable `y` is discrete
+
+        n_neighbors: See :func:`sklearn.feature_selection.mutual_info_regression`
+
+        copy: See :func:`sklearn.feature_selection.mutual_info_regression`
+
+        random_state: See :func:`sklearn.feature_selection.mutual_info_regression`
+
+        n_jobs: int, default=None
+            The number of parallel jobs to run for the conditional mutual information
+            computation. The parallelization is over the columns of `X`.
+            ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+            ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+            for more details.
+
+    Returns:
+        mi_matrix : ndarray, shape (n_features, n_features)
+            Interaction matrix between the features using (conditional) mutual information.
+            A negative value will be replaced by 0.
+
+    References:
+        .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual
+            information". Phys. Rev. E 69, 2004.
+        .. [2] B. C. Ross "Mutual Information between Discrete and Continuous
+            Data Sets". PLoS ONE 9(2), 2014.
+        .. [3] Mesner, Octavio César, and Cosma Rohilla Shalizi. "Conditional
+            mutual information estimation for mixed, discrete and continuous
+            data." IEEE Transactions on Information Theory 67.1 (2020): 464-484.
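+
+    Example:
+        A minimal illustrative sketch (the data below is synthetic and used
+        only for demonstration):
+
+        >>> import numpy as np
+        >>> rng = np.random.default_rng(0)
+        >>> X = rng.standard_normal((100, 3))
+        >>> y = X[:, 0] + 0.1 * rng.standard_normal(100)
+        >>> M = estimate_mi_matrix(X, y, discrete_features=False)
+        >>> M.shape
+        (3, 3)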
+ """ + X, y = check_X_y(X, y, accept_sparse="csc", y_numeric=not discrete_target) + n_samples, n_features = X.shape + + if isinstance(discrete_features, (str, bool)): + if isinstance(discrete_features, str): + if discrete_features == "auto": + discrete_features = issparse(X) + else: + raise ValueError("Invalid string value for discrete_features.") + discrete_mask = np.empty(n_features, dtype=bool) + discrete_mask.fill(discrete_features) + else: + discrete_features = check_array(discrete_features, ensure_2d=False) + if np.issubdtype(discrete_features.dtype, bool): + discrete_mask = np.zeros(n_features, dtype=bool) + discrete_mask[discrete_features] = True + else: + discrete_mask = discrete_features + + continuous_mask = ~discrete_mask + if np.any(continuous_mask) and issparse(X): + raise ValueError("Sparse matrix `X` can't have continuous features.") + + rng = check_random_state(random_state) + if np.any(continuous_mask): + X = X.astype(np.float64, copy=copy) + X[:, continuous_mask] = scale( + X[:, continuous_mask], with_mean=False, copy=False + ) + + # Add small noise to continuous features as advised in Kraskov et. al. + means = np.maximum(1, np.mean(np.abs(X[:, continuous_mask]), axis=0)) + X[:, continuous_mask] += ( + 1e-10 + * means + * rng.standard_normal(size=(n_samples, np.sum(continuous_mask))) + ) + + if not discrete_target: + y = scale(y, with_mean=False) + y += ( + 1e-10 + * np.maximum(1, np.mean(np.abs(y))) + * rng.standard_normal(size=n_samples) + ) + + mi_matrix = np.zeros((n_features, n_features), dtype=np.float64) + # Computing the diagonal terms + diagonal_vals = Parallel(n_jobs=n_jobs)( + delayed(_compute_mi)(x, y, discrete_feature, discrete_target, n_neighbors) + for x, discrete_feature in zip(_iterate_columns(X), discrete_mask) + ) + np.fill_diagonal(mi_matrix, diagonal_vals) + # Computing the off-diagonal terms + off_diagonal_iter = combinations(zip(_iterate_columns(X), discrete_mask), 2) + if conditional: + off_diagonal_vals = Parallel(n_jobs=n_jobs)( + delayed(_compute_cmi_distance) + (xi, xj, y, discrete_feature_i, discrete_feature_j, discrete_target, n_neighbors) + for (xi, discrete_feature_i), (xj, discrete_feature_j) in off_diagonal_iter) + else: + off_diagonal_vals = Parallel(n_jobs=n_jobs)( + delayed(_compute_mi) + (xi, xj, discrete_feature_i, discrete_feature_j, n_neighbors) + for (xi, discrete_feature_i), (xj, discrete_feature_j) in off_diagonal_iter) + + off_diagonal_val = iter(off_diagonal_vals) + for i, j in combinations(range(n_features), 2): + mi_matrix[i, j] = mi_matrix[j, i] = next(off_diagonal_val) + return mi_matrix + + +def _compute_cmi_distance( + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike, + discrete_feature_i: bool, + discrete_feature_j: bool, + discrete_target: bool, + n_neighbors: int = 4, +) -> float: + """ + Computes a distance `d` based on the conditional mutual information + between features `x_i` and `x_j`: `d = (I(x_i; y | x_j)+I(x_j; y | x_i))/2`. + Args: + xi: + A feature vector formatted as a numerical 1D array-like. + xj: + A feature vector formatted as a numerical 1D array-like. + y: + Target vector formatted as a numerical 1D array-like. + discrete_feature_i: + Whether to consider `xi` as a discrete variable. + discrete_feature_j: + Whether to consider `xj` as a discrete variable. + discrete_target: + Whether to consider `y` as a discrete variable. + n_neighbors: + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. 
Higher values reduce variance of the estimation, but + could introduce a bias. + Returns: + Distance between features based on conditional mutual information. + References + ---------- + .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual + information". Phys. Rev. E 69, 2004. + .. [2] B. C. Ross "Mutual Information between Discrete and Continuous + Data Sets". PLoS ONE 9(2), 2014. + .. [3] Mesner, Octavio César, and Cosma Rohilla Shalizi. "Conditional + mutual information estimation for mixed, discrete and continuous + data." IEEE Transactions on Information Theory 67.1 (2020): 464-484. + """ + if discrete_feature_i and discrete_feature_j and discrete_target: + ans = _compute_cmip_d(xi, xj, y) + elif discrete_target: + ans = _compute_cmip_ccd(xi, xj, y, n_neighbors) + elif discrete_feature_i and (not discrete_target): + ans = _compute_cmip_cdc(xj, xi, y, n_neighbors) + elif (not discrete_feature_i) and discrete_feature_j and (not discrete_target): + ans = _compute_cmip_cdc(xi, xj, y, n_neighbors) + else: + ans = _compute_cmip_c(xi, xj, y, n_neighbors) + return np.mean(ans) + + +def _compute_cmip_d( + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike) -> typing.Tuple[float]: + """ + Computes conditional mutual information pair `I(x_i; y | x_j)` and `I(x_j; y | x_i)`. + All random variables `x_i`, `x_j` and `y` are discrete. + + Adapted from https://github.com/dwave-examples/mutual-information-feature-selection + Args: + xi: + A feature vector formatted as a numerical 1D array-like. + xj: + A feature vector formatted as a numerical 1D array-like. + y: + Target vector formatted as a numerical 1D array-like. + n_neighbors: + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. + Returns: + Pair of conditional mutual information values between two features and the target vector. + """ + # Computing joint probability distribution for the features and the target + bin_boundaries = [np.hstack((np.unique(col), np.inf)) for col in (xi, xj, y)] + prob, _ = np.histogramdd(np.vstack((xi, xj, y)).T, bins=bin_boundaries) + + # Computing entropy + Hijy = entropy(prob.flatten()) # H(x_i, x_j, y) + Hij = entropy(prob.sum(axis=2).flatten()) # H(x_i, x_j) + cmi_ij = ( + Hij-Hijy + + entropy(prob.sum(axis=0).flatten()) # H(x_j, y) + - entropy(prob.sum(axis=(0, 2))) # H(x_j) + ) + cmi_ji = ( + Hij-Hijy + + entropy(prob.sum(axis=1).flatten()) # H(x_i, y) + - entropy(prob.sum(axis=(1, 2))) # H(x_i) + ) + return cmi_ij, cmi_ji + + +def _compute_cmip_c( + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike, + n_neighbors: int = 4) -> typing.Tuple[float]: + """ + Computes conditional mutual information pair `I(x_i; y | x_j)` + and `I(x_j; y | x_i)`. All random variables `x_i`, `x_j` and `y` + are assumed to be continuous. + + Adapted from https://github.com/jannisteunissen/mutual_information + :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + + Args: + xi: + A feature vector formatted as a numerical 1D array-like. + xj: + A feature vector formatted as a numerical 1D array-like. + y: + Target vector formatted as a numerical 1D array-like. + n_neighbors: + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. 
+ Returns: + Pair of conditional mutual information values between two features and the target vector. + """ + xi = xi.reshape((-1, 1)) + xj = xj.reshape((-1, 1)) + y = y.reshape((-1, 1)) + + # Here we rely on NearestNeighbors to select the fastest algorithm. + radius = _get_radius_k_neighbours(np.hstack((xi, xj, y)), + n_neighbors=n_neighbors) + + # KDTree is explicitly fit to allow for the querying of number of + # neighbors within a specified radius + n_j = _num_points_within_radius(xj, radius) + n_ij = _num_points_within_radius(np.hstack((xi, xj)), radius) + n_jy = _num_points_within_radius(np.hstack((y, xj)), radius) + + n_i = _num_points_within_radius(xi, radius) + n_iy = _num_points_within_radius(np.hstack((y, xi)), radius) + + return _get_cmi_pair_from_estimates([n_neighbors], n_ij, n_j, n_jy, n_i, n_iy) + + +def _compute_cmip_ccd( + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike, + n_neighbors: int = 4) -> typing.Tuple[float]: + """ + Computes conditional mutual information pair `I(x_i; y | x_j)` + and `I(x_j; y | x_i)`. Random variables `x_i`, `x_j` are assumed + to be continuous, while `y` is assumed to be discrete. + + Adapted from https://github.com/jannisteunissen/mutual_information + :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + Args: + xi: + A feature vector formatted as a numerical 1D array-like. + xj: + A feature vector formatted as a numerical 1D array-like. + y: + Target vector formatted as a numerical 1D array-like. + n_neighbors: + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. + Returns: + Pair of conditional mutual information values between two features and the target vector. + """ + xi = xi.reshape((-1, 1)) + xj = xj.reshape((-1, 1)) + + # Here we rely on NearestNeighbors to select the fastest algorithm. + radius, label_counts, k_all = _get_radius_k_neighbours_d(np.hstack((xi, xj)), y, + n_neighbors=n_neighbors) + + # Ignore points with unique labels. + mask = label_counts > 1 + # label_counts = label_counts[mask] + k_all = k_all[mask] + xi = xi[mask] + xj = xj[mask] + y = y[mask] + radius = radius[mask] + # KDTree is explicitly fit to allow for the querying of number of + # neighbors within a specified radius + # continuous + n_i = _num_points_within_radius(xi, radius) + n_j = _num_points_within_radius(xj, radius) + n_ij = _num_points_within_radius(np.hstack((xi, xj)), radius) + # mixed discrete/continuous estimates + n_iy = _num_points_within_radius_cd(xi, y, radius) + n_jy = _num_points_within_radius_cd(xj, y, radius) + + return _get_cmi_pair_from_estimates(k_all, n_ij, n_j, n_jy, n_i, n_iy) + + +def _compute_cmip_cdc( + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike, + n_neighbors: int = 4) -> typing.Tuple[float]: + """ + Computes conditional mutual information pair `I(x_i; y | x_j)` + and `I(x_j; y | x_i)`. Random variables `x_i`, `y` are assumed + to be continuous, while `x_j` is assumed to be discrete. + + Adapted from https://github.com/jannisteunissen/mutual_information + :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + Args: + xi: + A feature vector formatted as a numerical 1D array-like. + xj: + A feature vector formatted as a numerical 1D array-like. + y: + Target vector formatted as a numerical 1D array-like. 
+ n_neighbors: + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. + Returns: + Pair of conditional mutual information values between two features and the target vector. + """ + xi = xi.reshape((-1, 1)) + y = y.reshape((-1, 1)) + + # Here we rely on NearestNeighbors to select the fastest algorithm. + radius, label_counts, k_all = _get_radius_k_neighbours_d(np.hstack((xi, y)), xj, + n_neighbors=n_neighbors) + + # Ignore points with unique labels. + mask = label_counts > 1 + label_counts = label_counts[mask] + k_all = k_all[mask] + xi = xi[mask] + xj = xj[mask] + y = y[mask] + radius = radius[mask] + + # KDTree is explicitly fit to allow for the querying of number of + # neighbors within a specified radius + # continuous + n_i = _num_points_within_radius(xi, radius) + n_iy = _num_points_within_radius(np.hstack((xi, y)), radius) + # mixed coninuous/discrete estimates + n_ij = _num_points_within_radius_cd(xi, xj, radius) + n_jy = _num_points_within_radius_cd(y, xj, radius) + # discrete estimates + n_j = label_counts + + return _get_cmi_pair_from_estimates(k_all, n_ij, n_j, n_jy, n_i, n_iy) + + +def _get_cmi_pair_from_estimates( + k_all: npt.ArrayLike, n_ij: npt.ArrayLike, n_j: npt.ArrayLike, + n_jy: npt.ArrayLike, n_i: npt.ArrayLike, n_iy: npt.ArrayLike) -> typing.Tuple[float]: + """Get an estimate from nearest neighbors counts""" + common_terms = np.mean(digamma(k_all))-np.mean(digamma(n_ij)) + cmi_ij = common_terms+np.mean(digamma(n_j))-np.mean(digamma(n_jy)) + cmi_ji = common_terms+np.mean(digamma(n_i))-np.mean(digamma(n_iy)) + return max(0, cmi_ij), max(0, cmi_ji) + + +def _compute_mi( + x: npt.ArrayLike, + y: npt.ArrayLike, + discrete_feature_x: bool, + discrete_feature_y: bool, + n_neighbors: int = 4, +): + """ + Compute mutual information between features `I(x_i; x_j)`. + + Adapted from :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + + Args: + x, y: + Feature vectors each formatted as a numerical 1D array-like. + discrete_feature_x, discrete_feature_y: + Whether to consider `x` and `y` as discrete variables. + n_neighbors: + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. + Returns: + Mutual information between feature vectors x and y. + References + ---------- + .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual + information". Phys. Rev. E 69, 2004. + .. [2] B. C. Ross "Mutual Information between Discrete and Continuous + Data Sets". PLoS ONE 9(2), 2014. + """ + if discrete_feature_y and discrete_feature_x: + return mutual_info_score(x, y) + elif discrete_feature_x: + return _compute_mi_cd(y, x, n_neighbors=n_neighbors) + elif discrete_feature_y: + return _compute_mi_cd(x, y, n_neighbors=n_neighbors) + else: + return _compute_mi_cc(x, y, n_neighbors=n_neighbors) + + +def _compute_mi_cc(x: npt.ArrayLike, y: npt.ArrayLike, n_neighbors: int) -> float: + """Computes mutual information `I(x; y)`. Random variables `x`, `y` are assumed + to be continuous. + + Adapted from :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + Args: + x, y: + Feature vectors each formatted as a numerical 1D array-like. 
+ n_neighbors: + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. + Returns: + Mutual information between feature vectors x and y. + """ + n_samples = x.size + + x = x.reshape((-1, 1)) + y = y.reshape((-1, 1)) + + # Here we rely on NearestNeighbors to select the fastest algorithm. + radius = _get_radius_k_neighbours(np.hstack((x, y)), n_neighbors) + + # KDTree is explicitly fit to allow for the querying of number of + # neighbors within a specified radius + nx = _num_points_within_radius(x, radius) + ny = _num_points_within_radius(y, radius) + + mi = ( + digamma(n_samples) + + digamma(n_neighbors) + - np.mean(digamma(nx)) + - np.mean(digamma(ny)) + ) + + return max(0, mi) + + +def _compute_mi_cd(x: npt.ArrayLike, y: npt.ArrayLike, n_neighbors: int) -> float: + """Computes mutual information `I(x; y)`. Random variable `x`, is assumed + to be continuous, while `y` is assumed to be discrete. + + Adapted from :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + Args: + x, y: + Feature vectors each formatted as a numerical 1D array-like. + n_neighbors: + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. + Returns: + Mutual information between feature vectors x and y. + """ + radius, label_counts, k_all = _get_radius_k_neighbours_d(x, y, n_neighbors=n_neighbors) + + # Ignore points with unique labels. + mask = label_counts > 1 + n_samples = np.sum(mask) + label_counts = label_counts[mask] + k_all = k_all[mask] + x = x[mask] + radius = radius[mask] + + m_all = _num_points_within_radius(x.reshape(-1, 1), radius) + + mi = ( + digamma(n_samples) + + np.mean(digamma(k_all)) + - np.mean(digamma(label_counts)) + - np.mean(digamma(m_all)) + ) + + return max(0, mi) + + +def _get_radius_k_neighbours(X: npt.ArrayLike, n_neighbors: int = 4) -> npt.ArrayLike: + """ + Determine the smallest radius around `X` containing `n_neighbors` neighbors + Inspired by :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + + Args: + X: + Array of features. + n_neighbors: + Number of nearest neighbors. + Returns: + Vector of radii defined by the k nearest neighbors. + """ + nn = NearestNeighbors(metric="chebyshev", n_neighbors=n_neighbors) + return _get_radius_from_nn(X, nn) + + +def _get_radius_k_neighbours_d( + c: npt.ArrayLike, + d: npt.ArrayLike, + n_neighbors: int = 4 +) -> typing.Tuple[npt.ArrayLike, npt.ArrayLike, npt.ArrayLike]: + """ + Determine smallest radius around `c` and `d` containing `n_neighbors`. + + Inspired by :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + Args: + c: + Array of feature vectors. + d: + Vector of discrete features formatted as a numerical 1D array-like. + n_neighbors: + The number of nearest neighbors. + Returns: + radius: + Vector of radii defined by the k nearest neighbors. + label_counts: + Label counts of the discrete feature vector. + k_all: + Array of the number of points within a radius. 
+ """ + d = d.flatten() + n_samples = d.shape[0] + c = c.reshape((n_samples, -1)) + + radius = np.empty(n_samples) + label_counts = np.empty(n_samples) + k_all = np.empty(n_samples) + nn = NearestNeighbors(metric="chebyshev") + for label in np.unique(d): + mask = d == label + count = np.sum(mask) + if count > 1: + k = min(n_neighbors, count - 1) + nn.set_params(n_neighbors=k) + radius[mask] = _get_radius_from_nn(c[mask], nn) + k_all[mask] = k + label_counts[mask] = np.sum(mask) + return radius, label_counts, k_all + + +def _get_radius_from_nn(X: npt.ArrayLike, nn: NearestNeighbors) -> npt.ArrayLike: + """ + Get the radius of nearest neighbors in `nn` model with dataset `X`. + + Inspired by :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + Args: + X: + Array of features. + nn: + Instance of the nearest neighbors class. + Returns: + Vector of radii defined by the k nearest neighbors. + """ + nn.fit(X) + r = nn.kneighbors()[0] + return np.nextafter(r[:, -1], 0) + + +def _num_points_within_radius(x: npt.ArrayLike, radius: npt.ArrayLike) -> npt.ArrayLike: + """For each point, determine the number of other points within a given radius + Function from https://github.com/jannisteunissen/mutual_information + Args: + X: + (Continuous) feature array. + radius: + Vector of radii. + Returns: + Vector containing the number of points within a radius. + """ + kd = KDTree(x, metric="chebyshev") + nx = kd.query_radius(x, radius, count_only=True, return_distance=False) + return np.array(nx) + + +def _num_points_within_radius_cd( + c: npt.ArrayLike, + d: npt.ArrayLike, + radius: npt.ArrayLike) -> npt.ArrayLike: + """ + For each point, determine the number of other points within a given radius. + + Inspired by :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + Args: + c: + (Continuous) feature array. + d: + Discrete feature vector formatted as a numerical 1D array-like + radius: + Vector of radii. + Returns: + Vector containing the number of points within a radius. + """ + c = c.reshape((-1, 1)) + n_samples = c.shape[0] + m_all = np.empty(n_samples) + for label in np.unique(d): + mask = d == label + m_all[mask] = _num_points_within_radius(c[mask], radius[mask]) + return m_all + + +def _iterate_columns(X, columns=None): + """Iterate over columns of a matrix. + Copied from :func:`sklearn.feature_selection._mutual_info`. + + Parameters + ---------- + X : ndarray or csc_matrix, shape (n_samples, n_features) + Matrix over which to iterate. + + columns : iterable or None, default=None + Indices of columns to iterate over. If None, iterate over all columns. + + Yields + ------ + x : ndarray, shape (n_samples,) + Columns of `X` in dense format. 
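+
+    Examples
+    --------
+    A minimal sketch on a small dense array:
+
+    >>> import numpy as np
+    >>> [col.tolist() for col in _iterate_columns(np.array([[1, 2], [3, 4]]))]
+    [[1, 3], [2, 4]]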
+ """ + if columns is None: + columns = range(X.shape[1]) + + if issparse(X): + for i in columns: + x = np.zeros(X.shape[0]) + start_ptr, end_ptr = X.indptr[i], X.indptr[i + 1] + x[X.indices[start_ptr:end_ptr]] = X.data[start_ptr:end_ptr] + yield x + else: + for i in columns: + yield X[:, i] diff --git a/dwave/plugins/sklearn/transformers.py b/dwave/plugins/sklearn/transformers.py index 01275d0..db444ee 100644 --- a/dwave/plugins/sklearn/transformers.py +++ b/dwave/plugins/sklearn/transformers.py @@ -20,22 +20,14 @@ import dimod from dwave.cloud.exceptions import ConfigFileError, SolverAuthenticationError -from dwave.plugins.sklearn.utilities import ( - corrcoef, - _compute_cmi_distance, - _compute_mi, - _iterate_columns) +from dwave.plugins.sklearn._conditional_mutual_info import estimate_mi_matrix +from dwave.plugins.sklearn.utilities import corrcoef from dwave.system import LeapHybridCQMSampler import numpy as np import numpy.typing as npt -from scipy.sparse import issparse from sklearn.base import BaseEstimator from sklearn.feature_selection import SelectorMixin -from sklearn.feature_selection import mutual_info_classif, mutual_info_regression -from sklearn.preprocessing import scale -from sklearn.utils import check_random_state -from sklearn.utils.parallel import Parallel, delayed -from sklearn.utils.validation import check_is_fitted, check_array, check_X_y +from sklearn.utils.validation import check_is_fitted __all__ = ["SelectFromQuadraticModel"] @@ -450,154 +442,3 @@ def _check_params(X: npt.ArrayLike, if X.shape[0] <= 1: raise ValueError("X must have at least two rows") - - -def estimate_mi_matrix( - X: npt.ArrayLike, - y: npt.ArrayLike, - discrete_features: typing.Union[str, npt.ArrayLike] = "auto", - discrete_target: bool = False, - n_neighbors: int = 4, - conditional: bool = True, - copy: bool = True, - random_state: typing.Union[None, int] = None, - n_jobs: typing.Union[None, int] = None, -) -> npt.ArrayLike: - """ - For the feature array `X` and the target array `y` computes - the matrix of (conditional) mutual information interactions. - The matrix is defined as follows: - - `M_(i, i) = I(x_i; y)` - - If `conditional = True`, then the off-diagonal terms are computed: - - `M_(i, j) = (I(x_i; y| x_j) + I(x_j; y| x_i)) / 2` - - Otherwise - - `M_(i, j) = I(x_i; x_j)` - - Computation of I(x; y) uses modified scikit-learn methods. - The computation of I(x; y| z) is based on - https://github.com/jannisteunissen/mutual_information and [3]_. - - The method can be computationally expensive for a large number of features (> 1000) and - a large number of samples (> 100000). In this case, it can be advisable to downsample the - dataset. - - The estimation methods are linear in the number of outcomes (labels) of discrete distributions. - It may be beneficial to treat the discrete distrubitions with a large number of labels as - continuous distributions. 
- - Args: - X: See :func:`sklearn.feature_selection.mutual_info_regression` - - y: See :func:`sklearn.feature_selection.mutual_info_regression` - - conditional: bool, default=True - Whether to compute the off-diagonal terms using the conditional mutual - information or joint mutual information - - discrete_features: See :func:`sklearn.feature_selection.mutual_info_regression` - - discrete_target: bool, default=False - Whether the target variable `y` is discrete - - n_neighbors: See :func:`sklearn.feature_selection.mutual_info_regression` - - copy: See :func:`sklearn.feature_selection.mutual_info_regression` - - random_state: See :func:`sklearn.feature_selection.mutual_info_regression` - - n_jobs: int, default=None - The number of parallel jobs to run for the conditional mutual information - computation. The parallelization is over the columns of `X`. - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. - ``-1`` means using all processors. See :term:`Glossary ` - for more details. - - Returns: - mi_matrix : ndarray, shape (n_features, n_features) - Interaction matrix between the features using (conditional) mutual information. - A negative value will be replaced by 0. - - References: - .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual - information". Phys. Rev. E 69, 2004. - .. [2] B. C. Ross "Mutual Information between Discrete and Continuous - Data Sets". PLoS ONE 9(2), 2014. - .. [3] Mesner, Octavio César, and Cosma Rohilla Shalizi. "Conditional - mutual information estimation for mixed, discrete and continuous - data." IEEE Transactions on Information Theory 67.1 (2020): 464-484. - """ - X, y = check_X_y(X, y, accept_sparse="csc", y_numeric=not discrete_target) - n_samples, n_features = X.shape - - if isinstance(discrete_features, (str, bool)): - if isinstance(discrete_features, str): - if discrete_features == "auto": - discrete_features = issparse(X) - else: - raise ValueError("Invalid string value for discrete_features.") - discrete_mask = np.empty(n_features, dtype=bool) - discrete_mask.fill(discrete_features) - else: - discrete_features = check_array(discrete_features, ensure_2d=False) - if np.issubdtype(discrete_features.dtype, bool): - discrete_mask = np.zeros(n_features, dtype=bool) - discrete_mask[discrete_features] = True - else: - discrete_mask = discrete_features - - continuous_mask = ~discrete_mask - if np.any(continuous_mask) and issparse(X): - raise ValueError("Sparse matrix `X` can't have continuous features.") - - rng = check_random_state(random_state) - if np.any(continuous_mask): - X = X.astype(np.float64, copy=copy) - X[:, continuous_mask] = scale( - X[:, continuous_mask], with_mean=False, copy=False - ) - - # Add small noise to continuous features as advised in Kraskov et. al. 
- means = np.maximum(1, np.mean(np.abs(X[:, continuous_mask]), axis=0)) - X[:, continuous_mask] += ( - 1e-10 - * means - * rng.standard_normal(size=(n_samples, np.sum(continuous_mask))) - ) - - if not discrete_target: - y = scale(y, with_mean=False) - y += ( - 1e-10 - * np.maximum(1, np.mean(np.abs(y))) - * rng.standard_normal(size=n_samples) - ) - - mi_matrix = np.zeros((n_features, n_features), dtype=np.float64) - # Computing the diagonal terms - diagonal_vals = Parallel(n_jobs=n_jobs)( - delayed(_compute_mi)(x, y, discrete_feature, discrete_target, n_neighbors) - for x, discrete_feature in zip(_iterate_columns(X), discrete_mask) - ) - np.fill_diagonal(mi_matrix, diagonal_vals) - # Computing the off-diagonal terms - off_diagonal_iter = combinations(zip(_iterate_columns(X), discrete_mask), 2) - if conditional: - off_diagonal_vals = Parallel(n_jobs=n_jobs)( - delayed(_compute_cmi_distance) - (xi, xj, y, discrete_feature_i, discrete_feature_j, discrete_target, n_neighbors) - for (xi, discrete_feature_i), (xj, discrete_feature_j) in off_diagonal_iter) - else: - off_diagonal_vals = Parallel(n_jobs=n_jobs)( - delayed(_compute_mi) - (xi, xj, discrete_feature_i, discrete_feature_j, n_neighbors) - for (xi, discrete_feature_i), (xj, discrete_feature_j) in off_diagonal_iter) - - off_diagonal_val = iter(off_diagonal_vals) - for i, j in combinations(range(n_features), 2): - mi_matrix[i, j] = mi_matrix[j, i] = next(off_diagonal_val) - return mi_matrix diff --git a/dwave/plugins/sklearn/utilities.py b/dwave/plugins/sklearn/utilities.py index bbdf956..caccfcd 100644 --- a/dwave/plugins/sklearn/utilities.py +++ b/dwave/plugins/sklearn/utilities.py @@ -51,11 +51,6 @@ import numpy as np import numpy.typing as npt -from scipy.sparse import issparse -from scipy.special import digamma -from scipy.stats import entropy -from sklearn.metrics.cluster import mutual_info_score -from sklearn.neighbors import NearestNeighbors, KDTree __all__ = ["corrcoef", "cov", "dot_2d"] @@ -229,385 +224,3 @@ def dot_2d(a: npt.ArrayLike, b: npt.ArrayLike, *, out.flush() return out - - -def _compute_cmi_distance( - xi: npt.ArrayLike, - xj: npt.ArrayLike, - y: npt.ArrayLike, - discrete_feature_i: bool, - discrete_feature_j: bool, - discrete_target: bool, - n_neighbors: int = 4, -): - """ - Computing a distance `d` based on the conditional mutual infomation - between features `x_i` and `x_j`: `d = (I(x_i; y | x_j)+I(x_j; y | x_i))/2`. - - References - ---------- - .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual - information". Phys. Rev. E 69, 2004. - .. [2] B. C. Ross "Mutual Information between Discrete and Continuous - Data Sets". PLoS ONE 9(2), 2014. - .. [3] Mesner, Octavio César, and Cosma Rohilla Shalizi. "Conditional - mutual information estimation for mixed, discrete and continuous - data." IEEE Transactions on Information Theory 67.1 (2020): 464-484. 
- """ - if discrete_feature_i and discrete_feature_j and discrete_target: - ans = _compute_cmip_d(xi, xj, y) - elif discrete_target: - ans = _compute_cmip_ccd(xi, xj, y, n_neighbors) - elif discrete_feature_i and (not discrete_target): - ans = _compute_cmip_cdc(xj, xi, y, n_neighbors) - elif (not discrete_feature_i) and discrete_feature_j and (not discrete_target): - ans = _compute_cmip_cdc(xi, xj, y, n_neighbors) - else: - ans = _compute_cmip_c(xi, xj, y, n_neighbors) - return np.mean(ans) - - -def _compute_cmip_d( - xi: npt.ArrayLike, - xj: npt.ArrayLike, - y: npt.ArrayLike) -> typing.Tuple[float]: - """ - Computes conditional mutual infomation pair `I(x_i; y | x_j)` and `I(x_j; y | x_i)`. - All random variables `x_i`, `x_j` and `y` are discrete. - - Adpated from https://github.com/dwave-examples/mutual-information-feature-selection - """ - # Computing joint probability distribution for the features and the target - bin_boundaries = [np.hstack((np.unique(col), np.inf)) for col in (xi, xj, y)] - prob, _ = np.histogramdd(np.vstack((xi, xj, y)).T, bins=bin_boundaries) - - # Computing entropy - Hijy = entropy(prob.flatten()) # H(x_i, x_j, y) - Hij = entropy(prob.sum(axis=2).flatten()) # H(x_i, x_j) - cmi_ij = ( - Hij-Hijy - + entropy(prob.sum(axis=0).flatten()) # H(x_j, y) - - entropy(prob.sum(axis=(0, 2))) # H(x_j) - ) - cmi_ji = ( - Hij-Hijy - + entropy(prob.sum(axis=1).flatten()) # H(x_i, y) - - entropy(prob.sum(axis=(1, 2))) # H(x_i) - ) - return cmi_ij, cmi_ji - - -def _compute_cmip_c( - xi: npt.ArrayLike, - xj: npt.ArrayLike, - y: npt.ArrayLike, - n_neighbors: int = 4) -> typing.Tuple[float]: - """ - Computes conditional mutual information pair `I(x_i; y | x_j)` - and `I(x_j; y | x_i)`. All random variables `x_i`, `x_j` and `y` - are assumed to be continuous. - - Adapted from https://github.com/jannisteunissen/mutual_information - """ - xi = xi.reshape((-1, 1)) - xj = xj.reshape((-1, 1)) - y = y.reshape((-1, 1)) - - # Here we rely on NearestNeighbors to select the fastest algorithm. - radius = _get_radius_k_neighbours(np.hstack((xi, xj, y)), - n_neighbors=n_neighbors) - - # KDTree is explicitly fit to allow for the querying of number of - # neighbors within a specified radius - n_j = _num_points_within_radius(xj, radius) - n_ij = _num_points_within_radius(np.hstack((xi, xj)), radius) - n_jy = _num_points_within_radius(np.hstack((y, xj)), radius) - - n_i = _num_points_within_radius(xi, radius) - n_iy = _num_points_within_radius(np.hstack((y, xi)), radius) - - return _get_cmi_pair_from_estimates([n_neighbors], n_ij, n_j, n_jy, n_i, n_iy) - - -def _compute_cmip_ccd( - xi: npt.ArrayLike, - xj: npt.ArrayLike, - y: npt.ArrayLike, - n_neighbors: int = 4) -> typing.Tuple[float]: - """ - Computes conditional mutual information pair `I(x_i; y | x_j)` - and `I(x_j; y | x_i)`. Random variables `x_i`, `x_j` are assumed - to be continuous, while `y` is assumed to be discrete. - - Adapted from https://github.com/jannisteunissen/mutual_information and - :func:`sklearn.feature_selection.mutual_info_regression` - """ - xi = xi.reshape((-1, 1)) - xj = xj.reshape((-1, 1)) - - # Here we rely on NearestNeighbors to select the fastest algorithm. - radius, label_counts, k_all = _get_radius_k_neighbours_d(np.hstack((xi, xj)), y, - n_neighbors=n_neighbors) - - # Ignore points with unique labels. 
- mask = label_counts > 1 - label_counts = label_counts[mask] - k_all = k_all[mask] - xi = xi[mask] - xj = xj[mask] - y = y[mask] - radius = radius[mask] - # KDTree is explicitly fit to allow for the querying of number of - # neighbors within a specified radius - # continuous - n_i = _num_points_within_radius(xi, radius) - n_j = _num_points_within_radius(xj, radius) - n_ij = _num_points_within_radius(np.hstack((xi, xj)), radius) - # mixed discrete/continuous estimates - n_iy = _num_points_within_radius_cd(xi, y, radius) - n_jy = _num_points_within_radius_cd(xj, y, radius) - - return _get_cmi_pair_from_estimates(k_all, n_ij, n_j, n_jy, n_i, n_iy) - - -def _compute_cmip_cdc( - xi: npt.ArrayLike, - xj: npt.ArrayLike, - y: npt.ArrayLike, - n_neighbors: int = 4) -> typing.Tuple[float]: - """ - Computes conditional mutual information pair `I(x_i; y | x_j)` - and `I(x_j; y | x_i)`. Random variables `x_i`, `y` are assumed - to be continuous, while `x_j` is assumed to be discrete. - - Adapted from https://github.com/jannisteunissen/mutual_information and - :func:`sklearn.feature_selection.mutual_info_regression` - """ - xi = xi.reshape((-1, 1)) - y = y.reshape((-1, 1)) - - # Here we rely on NearestNeighbors to select the fastest algorithm. - radius, label_counts, k_all = _get_radius_k_neighbours_d(np.hstack((xi, y)), xj, - n_neighbors=n_neighbors) - - # Ignore points with unique labels. - mask = label_counts > 1 - label_counts = label_counts[mask] - k_all = k_all[mask] - xi = xi[mask] - xj = xj[mask] - y = y[mask] - radius = radius[mask] - - # KDTree is explicitly fit to allow for the querying of number of - # neighbors within a specified radius - # continuous - n_i = _num_points_within_radius(xi, radius) - n_iy = _num_points_within_radius(np.hstack((xi, y)), radius) - # mixed coninuous/discrete estimates - n_ij = _num_points_within_radius_cd(xi, xj, radius) - n_jy = _num_points_within_radius_cd(y, xj, radius) - # discrete estimates - n_j = label_counts - - return _get_cmi_pair_from_estimates(k_all, n_ij, n_j, n_jy, n_i, n_iy) - - -def _get_cmi_pair_from_estimates( - k_all: npt.ArrayLike, n_ij: npt.ArrayLike, n_j: npt.ArrayLike, - n_jy: npt.ArrayLike, n_i: npt.ArrayLike, n_iy: npt.ArrayLike) -> typing.Tuple[float]: - - common_terms = np.mean(digamma(k_all))-np.mean(digamma(n_ij)) - cmi_ij = common_terms+np.mean(digamma(n_j))-np.mean(digamma(n_jy)) - cmi_ji = common_terms+np.mean(digamma(n_i))-np.mean(digamma(n_iy)) - return max(0, cmi_ij), max(0, cmi_ji) - - -def _compute_mi( - x: npt.ArrayLike, - y: npt.ArrayLike, - discrete_feature_x: bool, - discrete_feature_y: bool, - n_neighbors: int = 4, -): - """ - Compute mutual information between features `I(x_i; x_j)`. - - References - ---------- - .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual - information". Phys. Rev. E 69, 2004. - .. [2] B. C. Ross "Mutual Information between Discrete and Continuous - Data Sets". PLoS ONE 9(2), 2014. - """ - if discrete_feature_y and discrete_feature_x: - return mutual_info_score(x, y) - elif discrete_feature_x: - return _compute_mi_cd(y, x, n_neighbors=n_neighbors) - elif discrete_feature_y: - return _compute_mi_cd(x, y, n_neighbors=n_neighbors) - else: - return _compute_mi_cc(x, y, n_neighbors=n_neighbors) - - -def _compute_mi_cc(x: npt.ArrayLike, y: npt.ArrayLike, n_neighbors: int) -> float: - """Computes mutual information `I(x; y)`. Random variables `x`, `y` are assumed - to be continuous. 
- - Adapted from :func:`sklearn.feature_selection.mutual_info_regression` - """ - n_samples = x.size - - x = x.reshape((-1, 1)) - y = y.reshape((-1, 1)) - - # Here we rely on NearestNeighbors to select the fastest algorithm. - radius = _get_radius_k_neighbours(np.hstack((x, y)), n_neighbors) - - # KDTree is explicitly fit to allow for the querying of number of - # neighbors within a specified radius - nx = _num_points_within_radius(x, radius) - ny = _num_points_within_radius(y, radius) - - mi = ( - digamma(n_samples) - + digamma(n_neighbors) - - np.mean(digamma(nx)) - - np.mean(digamma(ny)) - ) - - return max(0, mi) - - -def _compute_mi_cd(x: npt.ArrayLike, y: npt.ArrayLike, n_neighbors: int) -> float: - """Computes mutual information `I(x; y)`. Random variable `x`, is assumed - to be continuous, while `y` is assumed to be discrete. - - Adapted from :func:`sklearn.feature_selection.mutual_info_regression` - """ - radius, label_counts, k_all = _get_radius_k_neighbours_d(x, y, n_neighbors=n_neighbors) - - # Ignore points with unique labels. - mask = label_counts > 1 - n_samples = np.sum(mask) - label_counts = label_counts[mask] - k_all = k_all[mask] - x = x[mask] - radius = radius[mask] - - m_all = _num_points_within_radius(x.reshape(-1, 1), radius) - - mi = ( - digamma(n_samples) - + np.mean(digamma(k_all)) - - np.mean(digamma(label_counts)) - - np.mean(digamma(m_all)) - ) - - return max(0, mi) - - -def _get_radius_k_neighbours(X: npt.ArrayLike, n_neighbors: int = 4) -> npt.ArrayLike: - """ - Determine smallest radius around `X` containing `n_neighbors` neighbors - Inspired by :func:`sklearn.feature_selection.mutual_info_regression` - """ - nn = NearestNeighbors(metric="chebyshev", n_neighbors=n_neighbors) - return _get_radius_from_nn(X, nn) - - -def _get_radius_k_neighbours_d( - c: npt.ArrayLike, - d: npt.ArrayLike, - n_neighbors: int = 4 -) -> typing.Tuple[npt.ArrayLike, npt.ArrayLike, npt.ArrayLike]: - """ - Determine smallest radius around `c` and `d` containing `n_neighbors` neighbors - Inspired by :func:`sklearn.feature_selection.mutual_info_regression` - """ - d = d.flatten() - n_samples = d.shape[0] - c = c.reshape((n_samples, -1)) - - radius = np.empty(n_samples) - label_counts = np.empty(n_samples) - k_all = np.empty(n_samples) - nn = NearestNeighbors(metric="chebyshev") - for label in np.unique(d): - mask = d == label - count = np.sum(mask) - if count > 1: - k = min(n_neighbors, count - 1) - nn.set_params(n_neighbors=k) - radius[mask] = _get_radius_from_nn(c[mask], nn) - k_all[mask] = k - label_counts[mask] = np.sum(mask) - return radius, label_counts, k_all - - -def _get_radius_from_nn(X: npt.ArrayLike, nn: NearestNeighbors) -> npt.ArrayLike: - nn.fit(X) - r = nn.kneighbors()[0] - return np.nextafter(r[:, -1], 0) - - -def _num_points_within_radius(x: npt.ArrayLike, radius: npt.ArrayLike) -> npt.ArrayLike: - """For each point, determine the number of other points within a given radius - Function from https://github.com/jannisteunissen/mutual_information - - :param x: ndarray, shape (n_samples, n_dim) - :param radius: radius, shape (n_samples,) - :returns: number of points within radius - - """ - kd = KDTree(x, metric="chebyshev") - nx = kd.query_radius(x, radius, count_only=True, return_distance=False) - return np.array(nx) - - -def _num_points_within_radius_cd( - c: npt.ArrayLike, - d: npt.ArrayLike, - radius: npt.ArrayLike) -> npt.ArrayLike: - """ - For each point, determine the number of other points within a given radius - Inspired by 
:func:`sklearn.feature_selection.mutual_info_regression` - """ - c = c.reshape((-1, 1)) - n_samples = c.shape[0] - m_all = np.empty(n_samples) - for label in np.unique(d): - mask = d == label - m_all[mask] = _num_points_within_radius(c[mask], radius[mask]) - return m_all - - -def _iterate_columns(X, columns=None): - """Iterate over columns of a matrix. - Copied from :func:`sklearn.feature_selection._mutual_info` - - Parameters - ---------- - X : ndarray or csc_matrix, shape (n_samples, n_features) - Matrix over which to iterate. - - columns : iterable or None, default=None - Indices of columns to iterate over. If None, iterate over all columns. - - Yields - ------ - x : ndarray, shape (n_samples,) - Columns of `X` in dense format. - """ - if columns is None: - columns = range(X.shape[1]) - - if issparse(X): - for i in columns: - x = np.zeros(X.shape[0]) - start_ptr, end_ptr = X.indptr[i], X.indptr[i + 1] - x[X.indices[start_ptr:end_ptr]] = X.data[start_ptr:end_ptr] - yield x - else: - for i in columns: - yield X[:, i] diff --git a/tests/test_mutual_info.py b/tests/test_mutual_info.py new file mode 100644 index 0000000..45000b0 --- /dev/null +++ b/tests/test_mutual_info.py @@ -0,0 +1,369 @@ + +# Tests in TestMI are adapted from sklearn tests and licenced under: +# BSD 3-Clause License +# +# Copyright (c) 2007-2024 The scikit-learn developers. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Tests in TestCMI are adapted from https://github.com/jannisteunissen/mutual_information and +# distributed under the following licence: +# +# BSD 3-Clause License +# +# Copyright (c) 2022, Jannis Teunissen +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The modifications of the test and the tests in TestDCMI are distributed +# under the Apache 2.0 Software license: +# +# Copyright 2023 D-Wave Systems Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np + +from dwave.plugins.sklearn._conditional_mutual_info import ( + _compute_mi, + _compute_cmip_c, _compute_cmip_d, + _compute_cmip_cdc, _compute_cmip_ccd +) + +from numpy.linalg import det +from sklearn.feature_selection import mutual_info_classif +from sklearn.preprocessing import scale +from sklearn.utils import check_random_state + + +class TestMI(unittest.TestCase): + def test_against_sklearn(self): + # the test will break if sklearn modifies its implementation + from sklearn.feature_selection._mutual_info import _compute_mi_cd, _compute_mi_cc + from sklearn.metrics.cluster import mutual_info_score + # Test discrete implementation + n_samples, n_ss, n_neighbors, random_state = 103, 51, 4, 15 + np.random.seed(random_state) + xi = np.random.randint(0, 11, (n_samples, )) + xj = np.hstack(( + np.random.randint(-2, 1, (n_ss, )), + np.random.randint(10, 14, (n_samples-n_ss, )))) + np.random.shuffle(xi) + np.random.shuffle(xj) + xi_c = scale(np.random.randn(n_samples), with_mean=False) + xj_c = scale(np.random.randn(n_samples), with_mean=False) + + mi_ij = _compute_mi( + xi, xj, True, True, n_neighbors=n_neighbors) + mi_ij_cl = mutual_info_score(xi, xj) + self.assertTrue( + np.allclose(mi_ij_cl, mi_ij), + msg="Discrete mutual information is not computed correctly") + + mi_ij = _compute_mi( + xi_c, xj, False, True, n_neighbors=n_neighbors) + mi_ij_cl = _compute_mi_cd(xi_c, xj, n_neighbors=n_neighbors) + self.assertTrue( + np.allclose(mi_ij_cl, mi_ij, atol=1e-5), + msg=f"Error in continuous features/discrete target MI estimation is larger than expected {abs(mi_ij_cl-mi_ij)}") + + mi_ij = _compute_mi(xi_c, xj_c, False, False, n_neighbors=n_neighbors) + mi_ij_cl = _compute_mi_cc(xi_c, xj_c, n_neighbors=n_neighbors) + self.assertTrue( + np.allclose(mi_ij_cl, mi_ij, atol=1e-5), + msg=f"Error in purely continuous MI is larger than expected {abs(mi_ij_cl-mi_ij)}") + + def test_compute_mi_dd(self): + # In discrete case computations are straightforward and can be done + # by hand on given vectors. + x = np.array([0, 1, 1, 0, 0]) + y = np.array([1, 0, 0, 0, 1]) + + H_x = H_y = -(3 / 5) * np.log(3 / 5) - (2 / 5) * np.log(2 / 5) + H_xy = -1 / 5 * np.log(1 / 5) - 2 / 5 * np.log(2 / 5) - 2 / 5 * np.log(2 / 5) + I_xy = H_x + H_y - H_xy + + self.assertAlmostEqual(_compute_mi(x, y, discrete_feature_x=True, + discrete_feature_y=True), I_xy, places=4) + + def test_compute_mi_cc(self): + # For two continuous variables a good approach is to test on bivariate + # normal distribution, where mutual information is known. + + # Mean of the distribution, irrelevant for mutual information. + mean = np.zeros(2) + + # Setup covariance matrix with correlation coeff. equal 0.5. + sigma_1 = 1 + sigma_2 = 10 + corr = 0.5 + cov = np.array( + [ + [sigma_1**2, corr * sigma_1 * sigma_2], + [corr * sigma_1 * sigma_2, sigma_2**2], + ] + ) + + # True theoretical mutual information. 
+        I_theory = np.log(sigma_1) + np.log(sigma_2) - 0.5 * np.log(np.linalg.det(cov))
+
+        rng = check_random_state(0)
+        Z = rng.multivariate_normal(mean, cov, size=5001).astype(np.float64, copy=False)
+
+        x, y = Z[:, 0], Z[:, 1]
+
+        # Theory and computed values won't be very close
+        # We here check with a large relative tolerance
+        for n_neighbors in [4, 6, 8]:
+            I_computed = _compute_mi(
+                x, y, discrete_feature_x=False, discrete_feature_y=False, n_neighbors=n_neighbors
+            )
+            self.assertLessEqual(abs(I_computed/I_theory-1.0), 1e-1)
+
+    def test_compute_mi_cd(self):
+        # To test define a joint distribution as follows:
+        # p(x, y) = p(x) p(y | x)
+        # X ~ Bernoulli(p)
+        # (Y | x = 0) ~ Uniform(-1, 1)
+        # (Y | x = 1) ~ Uniform(0, 2)
+
+        # Use the following formula for mutual information:
+        # I(X; Y) = H(Y) - H(Y | X)
+        # Two entropies can be computed by hand:
+        # H(Y) = -(1-p)/2 * ln((1-p)/2) - p/2*log(p/2) - 1/2*log(1/2)
+        # H(Y | X) = ln(2)
+
+        # Now we need to implement sampling from our distribution, which is
+        # done easily using conditional distribution logic.
+
+        n_samples = 5001
+        rng = check_random_state(0)
+
+        for p in [0.3, 0.5, 0.7]:
+            x = rng.uniform(size=n_samples) > p
+
+            y = np.empty(n_samples, np.float64)
+            mask = x == 0
+            y[mask] = rng.uniform(-1, 1, size=np.sum(mask))
+            y[~mask] = rng.uniform(0, 2, size=np.sum(~mask))
+
+            I_theory = -0.5 * (
+                (1 - p) * np.log(0.5 * (1 - p)) + p * np.log(0.5 * p) + np.log(0.5)
+            ) - np.log(2)
+
+            # Check with the same relative tolerance as above.
+            for n_neighbors in [4, 6, 8]:
+                I_computed = _compute_mi(
+                    x, y, discrete_feature_x=True, discrete_feature_y=False, n_neighbors=n_neighbors
+                )
+                self.assertLessEqual(abs(I_computed/I_theory-1.0), 1e-1)
+
+    def test_compute_mi_cd_unique_label(self):
+        # Test that adding unique label doesn't change MI.
+        n_samples = 100
+        x = np.random.uniform(size=n_samples) > 0.5
+
+        y = np.empty(n_samples, np.float64)
+        mask = x == 0
+        y[mask] = np.random.uniform(-1, 1, size=np.sum(mask))
+        y[~mask] = np.random.uniform(0, 2, size=np.sum(~mask))
+
+        mi_1 = _compute_mi(x, y, discrete_feature_x=True, discrete_feature_y=False)
+
+        x = np.hstack((x, 2))
+        y = np.hstack((y, 10))
+        mi_2 = _compute_mi(x, y, discrete_feature_x=True, discrete_feature_y=False)
+
+        self.assertAlmostEqual(mi_1, mi_2, places=5)
+
+
+class TestCMI(unittest.TestCase):
+    def test_discrete(self):
+        # Third test from "Conditional Mutual Information Estimation for Mixed Discrete
+        # and Continuous Variables with Nearest Neighbors" (Mesner & Shalizi). Fully
+        # discrete
+        def rescale(xi):
+            xi = scale(xi + 0.0, with_mean=False, copy=False)
+            # Add small noise to continuous features as advised in Kraskov et al.
+ xi = xi.astype(np.float64, copy=False) + means = np.maximum(1, np.mean(np.abs(xi), axis=0)) + xi += (1e-10 * means * rng.standard_normal(size=(xi.shape[0], ))) + return xi + N = 1001 + rng = check_random_state(0) + choices = np.array([[1, 1], [-1, -1], [1, -1], [-1, 1]]) + p_discrete = rng.choice(4, p=[0.4, 0.4, 0.1, 0.1], size=N) + xy_discrete = choices[p_discrete] + x, y = xy_discrete[:, 0], xy_discrete[:, 1] + z = rng.poisson(2, size=N) + x_c = rescale(x) + y_c = rescale(y) + z_c = rescale(z) + cmi_analytic = 2 * 0.4 * np.log(0.4/0.5**2) + 2 * 0.1 * np.log(0.1/0.5**2) + # checking the computations with a large relative tolerance + for n_neighbors in [3, 5, 7]: + cmi_ij, _ = _compute_cmip_d(x, z, y) + self.assertLessEqual(abs(cmi_ij/cmi_analytic-1.0), 1e-1) + cmi_ij, _ = _compute_cmip_c(x_c, z_c, y_c, n_neighbors) + self.assertLessEqual(abs(cmi_ij/cmi_analytic-1.0), 1e-1) + cmi_ij, _ = _compute_cmip_ccd(x_c, z_c, y, n_neighbors) + self.assertLessEqual(abs(cmi_ij/cmi_analytic-1.0), 1e-1) + cmi_ij, _ = _compute_cmip_cdc(x_c, z, y_c, n_neighbors) + self.assertLessEqual(abs(cmi_ij/cmi_analytic-1.0), 1e-1) + + def test_bivariate(self): + """Test with bivariate normal variables""" + N = 1001 + rng = check_random_state(0) + mu = np.zeros(2) + cov = np.array([[1., 0.8], [0.8, 1.0]]) + xy_gauss = rng.multivariate_normal(mu, cov, size=N) + x, y = xy_gauss[:, 0], xy_gauss[:, 1] + z = rng.normal(size=N) + + cmi_analytic = -0.5 * np.log(det(cov)) + # checking the computations with a large relative tolerance + for n_neighbors in [4, 6, 8]: + cmi_ij, _ = _compute_cmip_c(x, z, y, n_neighbors) + self.assertAlmostEqual(cmi_ij/cmi_analytic, 1.0, places=1) + + def test_trivariate(self): + # Test with 'trivariate' normal variables x, y, z + + mu = np.zeros(3) + + # Covariance matrix + cov_xy = 0.7 + cov_xz = 0.5 + cov_yz = 0.3 + cov = np.array([[1, cov_xy, cov_xz], + [cov_xy, 1.0, cov_yz], + [cov_xz, cov_yz, 1]]) + N = 1001 + rng = check_random_state(0) + samples = rng.multivariate_normal(mu, cov, size=N) + x, y, z = samples[:, 0], samples[:, 1], samples[:, 2] + + # Construct minor matrices for x and y + cov_x = cov[1:, 1:] + cov_y = cov[[0, 2]][:, [0, 2]] + cmi_analytic = -0.5 * np.log(det(cov) / (det(cov_x) * det(cov_y))) + # checking the computations with a large relative tolerance + for n_neighbors in [4, 6, 8]: + cmi_ij, _ = _compute_cmip_c(x, z, y, n_neighbors) + self.assertAlmostEqual(cmi_ij/cmi_analytic, 1, places=1) + + +class TestDCMI(unittest.TestCase): + def test_cmi_symmetry(self): + # The pair produced by the computations should be symmetric in the first two arguments + np.random.seed(42) + n_samples, n_neighbors = 4003, 4 + # Test continuous implementation + Xy = np.random.randn(n_samples, 3) + cmi_ij_pair = _compute_cmip_c(Xy[:, 0], Xy[:, 1], Xy[:, 2], n_neighbors=n_neighbors) + cmi_ji_pair = _compute_cmip_c(Xy[:, 1], Xy[:, 0], Xy[:, 2], n_neighbors=n_neighbors) + self.assertAlmostEqual( + sum(cmi_ij_pair), sum(cmi_ji_pair), places=3, + msg="Computation for continuous conditional mutual information is not symmetric") + + c1 = np.random.randn(n_samples,) + c2 = np.random.randn(n_samples,) + n_ss = 1001 + d = np.hstack(( + np.random.randint(-2, 1, (n_ss, )), + np.random.randint(10, 14, (n_samples-n_ss, )))) + np.random.shuffle(d) + cmi_ij_pair = _compute_cmip_ccd(c1, c2, d, n_neighbors=n_neighbors) + cmi_ji_pair = _compute_cmip_ccd(c2, c1, d, n_neighbors=n_neighbors) + self.assertAlmostEqual( + sum(cmi_ij_pair), sum(cmi_ji_pair), places=3, + msg="Computation for mixed continuous/discrete 
conditional mutual information is not symmetric")
+
+    def test_cmi_discrete(self):
+        # We test the algorithm using the chain-rule identity `I(x;y|z) = I(x;y) + I(z;y|x) - I(z;y)`.
+        # If our algorithm is correct, this identity should hold up to estimation error.
+        np.random.seed(42)
+        # Test discrete implementation
+        n_samples, n_ss, n_neighbors = 103, 51, 4
+        xi = np.random.randint(0, 11, (n_samples, ))
+        xj = np.hstack((
+            np.random.randint(-2, 1, (n_ss, )),
+            np.random.randint(10, 14, (n_samples-n_ss, ))))
+        np.random.shuffle(xi)
+        np.random.shuffle(xj)
+        y = np.random.randint(-12, -10, (n_samples, ))
+        cmi_ij, cmi_ji = _compute_cmip_d(xi, xj, y)
+        # we estimate the marginal mutual information terms using sklearn's mutual_info_classif
+        mi_i = mutual_info_classif(
+            xi.reshape(-1, 1), y, discrete_features=True, n_neighbors=n_neighbors)
+        mi_j = mutual_info_classif(
+            xj.reshape(-1, 1), y, discrete_features=True, n_neighbors=n_neighbors)
+
+        self.assertTrue(
+            np.allclose(cmi_ij + mi_j - mi_i, cmi_ji, atol=1e-5),
+            msg="The chain rule for discrete conditional mutual information is violated")
+        cmi_ij2 = _compute_cmip_d(xj, xi, y)
+        self.assertTrue(
+            np.allclose(sum(cmi_ij2), cmi_ji+cmi_ij, atol=1e-5),
+            msg="Discrete conditional mutual information computation is not symmetric")
diff --git a/tests/test_utilities.py b/tests/test_utilities.py
index 364a64b..5b469e5 100644
--- a/tests/test_utilities.py
+++ b/tests/test_utilities.py
@@ -50,16 +50,7 @@
 
 import numpy as np
 
-from dwave.plugins.sklearn.utilities import (
-    corrcoef, cov, dot_2d,
-    _compute_mi,
-    _compute_cmip_c, _compute_cmip_d,
-    _compute_cmip_cdc, _compute_cmip_ccd
-)
-from scipy.special import digamma
-from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
-from sklearn.neighbors import KDTree
-from sklearn.preprocessing import scale
+from dwave.plugins.sklearn.utilities import corrcoef, cov, dot_2d
 
 
 class TestCorrCoef(unittest.TestCase):
@@ -186,186 +177,3 @@ def test_memmap(self):
 
             out = np.memmap(fout, "float64", mode="w+", shape=(X.shape[0], X.shape[0]))
 
             dot_2d(X, X.T, out=out)
-
-
-class TestMI(unittest.TestCase):
-    def test_cmi_mixed(self):
-        np.random.seed(42)
-        n_samples, n_neighbors = 4003, 4
-        c1 = np.random.randn(n_samples,)
-        n_ss = 1001
-        d1 = np.hstack((
-            np.random.randint(-2, 1, (n_ss, )),
-            np.random.randint(10, 14, (n_samples-n_ss, ))))
-        d2 = np.random.randint(0, 4, (n_samples, ))
-        np.random.shuffle(d2)
-        cmi_ij_pair = _compute_cmip_c(c1, d1, d2, n_neighbors=n_neighbors)
-        cmi_ij_pair_cdc = _compute_cmip_cdc(c1, d1, d2, n_neighbors=n_neighbors)
-        cmi_ij_pair_ccd = _compute_cmip_ccd(c1, d1, d2, n_neighbors=n_neighbors)
-        self.assertAlmostEqual(
-            sum(cmi_ij_pair), sum(cmi_ij_pair_cdc), places=4,
-            msg="Computation for mixed continuous/discrete conditional mutual information is not accurate")
-        self.assertAlmostEqual(
-            sum(cmi_ij_pair), sum(cmi_ij_pair_ccd), places=4,
-            msg="Computation for mixed continuous/discrete conditional mutual information is not accurate")
-
-    def test_cmi_symmetry(self):
-        np.random.seed(42)
-        n_samples, n_neighbors = 4003, 4
-        # Test continuous implementation
-        Xy = np.random.randn(n_samples, 3)
-        cmi_ij_pair = _compute_cmip_c(Xy[:, 0], Xy[:, 1], Xy[:, 2], n_neighbors=n_neighbors)
-        cmi_ji_pair = _compute_cmip_c(Xy[:, 1], Xy[:, 0], Xy[:, 2], n_neighbors=n_neighbors)
-        self.assertAlmostEqual(
-            sum(cmi_ij_pair), sum(cmi_ji_pair), places=3,
-            msg="Computation for continuous conditional mutual information is not symmetric")
-
-        c1 = np.random.randn(n_samples,)
-        c2 = np.random.randn(n_samples,)
-        n_ss = 1001
-        d = np.hstack((
-
np.random.randint(-2, 1, (n_ss, )), - np.random.randint(10, 14, (n_samples-n_ss, )))) - np.random.shuffle(d) - cmi_ij_pair = _compute_cmip_ccd(c1, c2, d, n_neighbors=n_neighbors) - cmi_ji_pair = _compute_cmip_ccd(c2, c1, d, n_neighbors=n_neighbors) - self.assertAlmostEqual( - sum(cmi_ij_pair), sum(cmi_ji_pair), places=3, - msg="Computation for mixed continuous/discrete conditional mutual information is not symmetric") - - def test_cmi_discrete(self): - """ - We test the algorithm using the formula `I(x;y|z) = I(x;y) + I(z;y) - I(z;y|x)`. - Since the mutual information data is an sklearn function, - :func:`sklearn.feature_selection._mutual_info.mutual_info_classif` - it is highly likely that formula is valid only if our algorithm is correct. - """ - np.random.seed(42) - # Test discrete implementation - n_samples, n_ss, n_neighbors = 103, 51, 4 - xi = np.random.randint(0, 11, (n_samples, )) - xj = np.hstack(( - np.random.randint(-2, 1, (n_ss, )), - np.random.randint(10, 14, (n_samples-n_ss, )))) - np.random.shuffle(xi) - np.random.shuffle(xj) - y = np.random.randint(-12, -10, (n_samples, )) - cmi_ij, cmi_ji = _compute_cmip_d(xi, xj, y) - mi_i = mutual_info_classif( - xi.reshape(-1, 1), y, discrete_features=True, n_neighbors=n_neighbors) - mi_j = mutual_info_classif( - xj.reshape(-1, 1), y, discrete_features=True, n_neighbors=n_neighbors) - self.assertTrue( - np.allclose(cmi_ij + mi_j - mi_i, cmi_ji, atol=1e-5), - msg="The chain rule for discrete conditional mutual information is violated") - cmi_ij2 = _compute_cmip_d(xj, xi, y) - self.assertTrue( - np.allclose(sum(cmi_ij2), cmi_ji+cmi_ij, atol=1e-5), - msg="Discrete conditional mutual information computation is not symmetric") - - def test_mi(self): - from sklearn.feature_selection._mutual_info import _compute_mi_cd, _compute_mi_cc - from sklearn.metrics.cluster import mutual_info_score - # Test discrete implementation - n_samples, n_ss, n_neighbors, random_state = 103, 51, 4, 15 - np.random.seed(random_state) - xi = np.random.randint(0, 11, (n_samples, )) - xj = np.hstack(( - np.random.randint(-2, 1, (n_ss, )), - np.random.randint(10, 14, (n_samples-n_ss, )))) - np.random.shuffle(xi) - np.random.shuffle(xj) - xi_c = scale(np.random.randn(n_samples), with_mean=False) - xj_c = scale(np.random.randn(n_samples), with_mean=False) - - mi_ij = _compute_mi( - xi, xj, True, True, n_neighbors=n_neighbors) - mi_ij_cl = mutual_info_score(xi, xj) - self.assertTrue( - np.allclose(mi_ij_cl, mi_ij), - msg="Discrete mutual information is not computed correctly") - - mi_ij = _compute_mi( - xi_c, xj, False, True, n_neighbors=n_neighbors) - mi_ij_cl = _compute_mi_cd(xi_c, xj, n_neighbors=n_neighbors) - self.assertTrue( - np.allclose(mi_ij_cl, mi_ij, atol=1e-5), - msg=f"Error in continuous features/discrete target MI estimation is larger than expected {abs(mi_ij_cl-mi_ij)}") - - mi_ij = _compute_mi(xi_c, xj_c, False, False, n_neighbors=n_neighbors) - mi_ij_cl = _compute_mi_cc(xi_c, xj_c, n_neighbors=n_neighbors) - self.assertTrue( - np.allclose(mi_ij_cl, mi_ij, atol=1e-5), - msg=f"Error in purely continuous MI is larger than expected {abs(mi_ij_cl-mi_ij)}") - - def test_cmi_continuous(self): - # methods from https://github.com/jannisteunissen/mutual_information - def _compute_cmi_t(x, z, y, n_neighbors): - n_samples = len(x) - - x = x.reshape(-1, 1) - y = y.reshape(-1, 1) - z = z.reshape(-1, 1) - xyz = np.hstack((x, y, z)) - k = np.full(n_samples, n_neighbors) - radius = get_radius_kneighbors(xyz, n_neighbors) - - mask = (radius == 0) - if mask.sum() > 0: - 
vals, ix, counts = np.unique(xyz[mask], axis=0, - return_inverse=True, - return_counts=True) - k[mask] = counts[ix] - 1 - - nxz = num_points_within_radius(np.hstack((x, z)), radius) - nyz = num_points_within_radius(np.hstack((y, z)), radius) - nz = num_points_within_radius(z, radius) - - cmi = max(0, np.mean(digamma(k)) - np.mean(digamma(nxz + 1)) - - np.mean(digamma(nyz + 1)) + np.mean(digamma(nz + 1))) - return cmi - - def get_radius_kneighbors(x, n_neighbors): - """Determine smallest radius around x containing n_neighbors neighbors - - :param x: ndarray, shape (n_samples, n_dim) - :param n_neighbors: number of neighbors - :returns: radius, shape (n_samples,) - """ - # Use KDTree for simplicity (sometimes a ball tree could be faster) - kd = KDTree(x, metric="chebyshev") - - # Results include point itself, therefore n_neighbors+1 - neigh_dist = kd.query(x, k=n_neighbors+1)[0] - - # Take radius slightly larger than distance to last neighbor - radius = np.nextafter(neigh_dist[:, -1], 0) - return radius - - def num_points_within_radius(x, radius): - """For each point, determine the number of other points within a given radius - - :param x: ndarray, shape (n_samples, n_dim) - :param radius: radius, shape (n_samples,) - :returns: number of points within radius - """ - kd = KDTree(x, metric="chebyshev") - nx = kd.query_radius(x, radius, count_only=True, return_distance=False) - return np.array(nx) - 1.0 - - np.random.seed(42) - n_samples, n_neighbors = 103, 4 - # Test continuous implementation - Xy = np.random.randn(n_samples, 3) - cmi_t_01 = _compute_cmi_t(Xy[:, 0], Xy[:, 1], Xy[:, 2], n_neighbors=n_neighbors) - cmi_t_10 = _compute_cmi_t(Xy[:, 1], Xy[:, 0], Xy[:, 2], n_neighbors=n_neighbors) - - cmi_ij, cmi_ji = _compute_cmip_c(Xy[:, 0], Xy[:, 1], Xy[:, 2], n_neighbors=n_neighbors) - - self.assertAlmostEqual( - max(cmi_t_01, 0), cmi_ij, places=5, - msg="The algorithm doesn't match the original implementation") - - self.assertAlmostEqual( - max(cmi_t_10, 0), cmi_ji, places=5, - msg="The algorithm doesn't match the original implementation") From 4d29725e284f4c35eb7c6d0f82af7e3591ce43cf Mon Sep 17 00:00:00 2001 From: Aivar Sootla Date: Wed, 24 Apr 2024 11:03:13 +0100 Subject: [PATCH 10/12] import bug fix --- dwave/plugins/sklearn/_conditional_mutual_info.py | 1 + dwave/plugins/sklearn/transformers.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/dwave/plugins/sklearn/_conditional_mutual_info.py b/dwave/plugins/sklearn/_conditional_mutual_info.py index 9e5a546..36eec3c 100644 --- a/dwave/plugins/sklearn/_conditional_mutual_info.py +++ b/dwave/plugins/sklearn/_conditional_mutual_info.py @@ -55,6 +55,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from itertools import combinations import typing import numpy as np diff --git a/dwave/plugins/sklearn/transformers.py b/dwave/plugins/sklearn/transformers.py index db444ee..aa20d06 100644 --- a/dwave/plugins/sklearn/transformers.py +++ b/dwave/plugins/sklearn/transformers.py @@ -14,7 +14,6 @@ from __future__ import annotations -from itertools import combinations import tempfile import typing From 9560f1dea31e1b709793e71de435ba7493c5ff34 Mon Sep 17 00:00:00 2001 From: Aivar Sootla Date: Wed, 24 Apr 2024 15:31:09 +0100 Subject: [PATCH 11/12] making mi cqm consistent with the correlation based algorithm --- dwave/plugins/sklearn/transformers.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/dwave/plugins/sklearn/transformers.py b/dwave/plugins/sklearn/transformers.py index aa20d06..91795ac 100644 --- a/dwave/plugins/sklearn/transformers.py +++ b/dwave/plugins/sklearn/transformers.py @@ -239,13 +239,11 @@ def mutual_information_cqm( Class labels formatted as a numerical 1D array-like. alpha: Hyperparameter between 0 and 1 that controls the relative weight of - the relevance and redundancy terms. - ``alpha=0`` places no weight on the quality of the features, - therefore the features will be selected as to minimize the - redundancy without any consideration to quality. - ``alpha=1`` places the maximum weight on the quality of the features, + the relevance and redundancy terms. Active if conditional=False + ``alpha=0`` places the maximum weight on the feature redundancy. + ``alpha=1`` places no weight on the feature redudancy, and therefore will be equivalent to using - :class:`sklearn.feature_selection.SelectKBest`. + :class:`sklearn.feature_selection.SelectKBest` the with mutual information metric. num_features: The number of features to select. strict: @@ -312,8 +310,8 @@ def mutual_information_cqm( # method from [1]_. # mutpliypling off-diagonal ones with -1 diagonal = -np.diag(mi).copy() - # mutpliypling off-diagonal ones with alpha - np.multiply(mi, alpha, out=mi) + # mutpliypling off-diagonal ones with 1-alpha + np.multiply(mi, (1-alpha), out=mi) np.fill_diagonal(mi, diagonal) it = np.nditer(mi, flags=['multi_index'], op_flags=[['readonly']]) From 85aa2fba1551a740d6ce8acb37834fdffca11988 Mon Sep 17 00:00:00 2001 From: Aivar Sootla Date: Fri, 26 Apr 2024 12:06:47 +0000 Subject: [PATCH 12/12] add alpha tuning dial to CMI method --- dwave/plugins/sklearn/transformers.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/dwave/plugins/sklearn/transformers.py b/dwave/plugins/sklearn/transformers.py index 91795ac..1cb8f50 100644 --- a/dwave/plugins/sklearn/transformers.py +++ b/dwave/plugins/sklearn/transformers.py @@ -227,7 +227,7 @@ def mutual_information_cqm( If ``conditional`` is True then the conditional mutual information criterion from [2] is used, and if ``conditional`` is False then - mutual information based criterion from [1] is used. + mutual information based criteria from [1] are used. For computation of mutual information and conditional mutual information @@ -239,11 +239,14 @@ def mutual_information_cqm( Class labels formatted as a numerical 1D array-like. alpha: Hyperparameter between 0 and 1 that controls the relative weight of - the relevance and redundancy terms. Active if conditional=False + the relevance and redundancy terms. ``alpha=0`` places the maximum weight on the feature redundancy. 
``alpha=1`` places no weight on the feature redundancy, and
+            therefore will be equivalent to using
-            :class:`sklearn.feature_selection.SelectKBest` the with mutual information metric.
+            :class:`sklearn.feature_selection.SelectKBest`.
+            If conditional=True, ``alpha = 0`` is the default value in [2]_.
+            If conditional=False, ``alpha = (num_features - 1) / num_features``
+            is the default value in [1]_.
         num_features:
             The number of features to select.
         strict:
@@ -303,16 +306,17 @@
             random_state=random_state,
             n_jobs=n_jobs,
             conditional=conditional)
+        diagonal = -np.diag(mi).copy()
         if conditional:
-            # method from [2]_.
-            np.multiply(mi, -1, out=mi)
+            # method from [2]_ with an additional alpha tuning dial.
+            # multiplying off-diagonal terms with -(1-alpha)
+            np.multiply(mi, -(1-alpha), out=mi)
         else:
             # method from [1]_.
-            # mutpliypling off-diagonal ones with -1
-            diagonal = -np.diag(mi).copy()
             # multiplying off-diagonal terms with 1-alpha
-            np.multiply(mi, (1-alpha), out=mi)
-            np.fill_diagonal(mi, diagonal)
+            np.multiply(mi, 1-alpha, out=mi)
+        # keep the relevance terms on the diagonal
+        np.fill_diagonal(mi, diagonal)
 
         it = np.nditer(mi, flags=['multi_index'], op_flags=[['readonly']])
         cqm.set_objective((*it.multi_index, x) for x in it if x)
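
For reference, the weighting that the final two patches apply before calling cqm.set_objective can be sketched with a few lines of standalone NumPy. This is only an illustration, not the plugin's code: the 3x3 mi matrix and the objective_coefficients helper below are invented for the example, under the convention (taken from the diff) that mi[i, i] holds the relevance of feature i to the target and mi[i, j] holds the redundancy between features i and j.

import numpy as np

# Hypothetical 3-feature example (values made up): diagonal = relevance to the
# target, off-diagonal = redundancy between features.
mi = np.array([
    [0.90, 0.30, 0.10],
    [0.30, 0.70, 0.20],
    [0.10, 0.20, 0.50],
])


def objective_coefficients(mi, alpha, conditional=False):
    """Mirror the (1 - alpha) weighting sketched in the patched mutual_information_cqm."""
    mi = np.array(mi, dtype=float)           # work on a copy
    diagonal = -np.diag(mi).copy()           # relevance terms enter with a minus sign
    if conditional:
        np.multiply(mi, -(1 - alpha), out=mi)   # CMI-based redundancy terms
    else:
        np.multiply(mi, 1 - alpha, out=mi)      # MI-based redundancy terms
    np.fill_diagonal(mi, diagonal)           # keep the relevance terms on the diagonal
    return mi


for alpha in (0.0, 0.5, 1.0):
    print(alpha)
    print(np.round(objective_coefficients(mi, alpha), 3))

With alpha=1 the off-diagonal terms vanish and only the negative relevance terms remain (a pure relevance ranking); with alpha=0 the redundancy terms receive full weight, consistent with the behaviour described in the updated docstring.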