From 0b188101e0a3cd9a5f9815b707c4c74ac4aa59d4 Mon Sep 17 00:00:00 2001 From: Aivar Sootla Date: Fri, 15 Mar 2024 16:20:47 +0000 Subject: [PATCH 01/12] Update objective computation: multiply quadratic biases with (1-alpha) --- dwave/plugins/sklearn/transformers.py | 30 ++++++++++++------- .../algorithm-update-d73cd4f7c854a8b3.yaml | 3 ++ 2 files changed, 22 insertions(+), 11 deletions(-) create mode 100644 releasenotes/notes/algorithm-update-d73cd4f7c854a8b3.yaml diff --git a/dwave/plugins/sklearn/transformers.py b/dwave/plugins/sklearn/transformers.py index aa186be..f0fe455 100644 --- a/dwave/plugins/sklearn/transformers.py +++ b/dwave/plugins/sklearn/transformers.py @@ -185,7 +185,8 @@ def correlation_cqm( label=f"{num_features}-hot", ) - with tempfile.TemporaryFile() as fX, tempfile.TemporaryFile() as fout: + with tempfile.TemporaryFile() as fX, tempfile.TemporaryFile() as fout,\ + tempfile.TemporaryFile() as targetcor_out: # we make a copy of X because we'll be modifying it in-place within # some of the functions X_copy = np.memmap(fX, X.dtype, mode="w+", shape=(X.shape[0], X.shape[1] + 1)) @@ -199,23 +200,30 @@ def correlation_cqm( mode="w+", shape=(X_copy.shape[1], X_copy.shape[1]), ) - + # make the matrix that will hold the output correlations + target_correlations = np.memmap( + targetcor_out, + dtype=np.result_type(X, y), + mode="w+", + shape=(X_copy.shape[1], 1), + ) # main calculation. It modifies X_copy in-place corrcoef(X_copy, out=correlations, rowvar=False, copy=False) - # we don't care about the direction of correlation in terms of # the penalty/quality np.absolute(correlations, out=correlations) - + # copying the correlations with the target column + target_correlations = correlations[:, -1].copy() + # multiplying all terms with (1 - alpha) + np.multiply(correlations, (1 - alpha), out=correlations) # our objective - # we multiply by 2 because the matrix is symmetric - np.fill_diagonal(correlations, correlations[:, -1] * (-2 * alpha * num_features)) - - # Note: the full symmetric matrix (with both upper- and lower-diagonal - # entries for each correlation coefficient) is retained for consistency with - # the original formulation from Milne et al. + # we multiply by num_features to have consistent performance + # with the increase of the number of features + np.fill_diagonal(correlations, target_correlations * (- alpha * num_features)) + # Note: we only add terms on and above the diagonal it = np.nditer(correlations[:-1, :-1], flags=['multi_index'], op_flags=[['readonly']]) - cqm.set_objective((*it.multi_index, x) for x in it if x) + cqm.set_objective((*it.multi_index, x) for x in it + if it.multi_index[0] <= it.multi_index[1] and x) return cqm diff --git a/releasenotes/notes/algorithm-update-d73cd4f7c854a8b3.yaml b/releasenotes/notes/algorithm-update-d73cd4f7c854a8b3.yaml new file mode 100644 index 0000000..dc85f5f --- /dev/null +++ b/releasenotes/notes/algorithm-update-d73cd4f7c854a8b3.yaml @@ -0,0 +1,3 @@ +--- +fixes: + - Multiply off-diagonal terms of the correlation matrix with 1-alpha. 
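A minimal standalone sketch of the selection model this patch produces, for reference only (it uses plain numpy.corrcoef instead of the memory-mapped corrcoef above, and toy_correlation_cqm is a hypothetical helper, not plugin API): with binary indicators x_i, the objective is (1 - alpha) * sum_{i<j} |corr(X_i, X_j)| x_i x_j - alpha * num_features * sum_i |corr(X_i, y)| x_i, subject to the num_features-hot constraint.

import dimod
import numpy as np

def toy_correlation_cqm(X, y, num_features, alpha=0.5):
    # Absolute correlations of [X | y]; the last column holds correlations with the target.
    corr = np.abs(np.corrcoef(np.column_stack([X, y]), rowvar=False))
    n = X.shape[1]
    obj = dimod.BinaryQuadraticModel(vartype=dimod.BINARY)
    for i in range(n):
        # Linear bias: feature quality (correlation with the target), weighted by alpha * num_features.
        obj.add_linear(i, -alpha * num_features * corr[i, -1])
        for j in range(i + 1, n):
            # Quadratic bias: redundancy between features i and j, weighted by (1 - alpha).
            obj.add_quadratic(i, j, (1 - alpha) * corr[i, j])
    cqm = dimod.ConstrainedQuadraticModel()
    cqm.set_objective(obj)
    cqm.add_constraint(((v, 1) for v in cqm.variables), '==', num_features,
                       label=f"{num_features}-hot")
    return cqm

With this weighting, alpha=1 zeroes the quadratic (redundancy) terms and alpha=0 zeroes the linear (quality) terms, which is what the tests added in the following patch check.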
\ No newline at end of file From 6981c61d8a59fb655d27fc67063bd7beb8194823 Mon Sep 17 00:00:00 2001 From: Aivar Sootla Date: Wed, 20 Mar 2024 10:01:22 +0000 Subject: [PATCH 02/12] address comments: replace a tempfile with direct computations, add a test --- dwave/plugins/sklearn/transformers.py | 20 +++++--------------- tests/test_transformer.py | 4 ++++ 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/dwave/plugins/sklearn/transformers.py b/dwave/plugins/sklearn/transformers.py index f0fe455..ada1ad3 100644 --- a/dwave/plugins/sklearn/transformers.py +++ b/dwave/plugins/sklearn/transformers.py @@ -185,8 +185,7 @@ def correlation_cqm( label=f"{num_features}-hot", ) - with tempfile.TemporaryFile() as fX, tempfile.TemporaryFile() as fout,\ - tempfile.TemporaryFile() as targetcor_out: + with tempfile.TemporaryFile() as fX, tempfile.TemporaryFile() as fout: # we make a copy of X because we'll be modifying it in-place within # some of the functions X_copy = np.memmap(fX, X.dtype, mode="w+", shape=(X.shape[0], X.shape[1] + 1)) @@ -199,27 +198,18 @@ def correlation_cqm( dtype=np.result_type(X, y), mode="w+", shape=(X_copy.shape[1], X_copy.shape[1]), - ) - # make the matrix that will hold the output correlations - target_correlations = np.memmap( - targetcor_out, - dtype=np.result_type(X, y), - mode="w+", - shape=(X_copy.shape[1], 1), - ) + ) # main calculation. It modifies X_copy in-place corrcoef(X_copy, out=correlations, rowvar=False, copy=False) # we don't care about the direction of correlation in terms of # the penalty/quality np.absolute(correlations, out=correlations) - # copying the correlations with the target column - target_correlations = correlations[:, -1].copy() - # multiplying all terms with (1 - alpha) - np.multiply(correlations, (1 - alpha), out=correlations) + # multiplying all but last columns and rows with (1 - alpha) + np.multiply(correlations[:-1, :-1], (1 - alpha), out=correlations[:-1, :-1]) # our objective # we multiply by num_features to have consistent performance # with the increase of the number of features - np.fill_diagonal(correlations, target_correlations * (- alpha * num_features)) + np.fill_diagonal(correlations, correlations[:, -1] * (- alpha * num_features)) # Note: we only add terms on and above the diagonal it = np.nditer(correlations[:-1, :-1], flags=['multi_index'], op_flags=[['readonly']]) cqm.set_objective((*it.multi_index, x) for x in it diff --git a/tests/test_transformer.py b/tests/test_transformer.py index 14214cc..6e0ef3e 100644 --- a/tests/test_transformer.py +++ b/tests/test_transformer.py @@ -126,6 +126,10 @@ def test_alpha_0(self): cqm = SelectFromQuadraticModel.correlation_cqm(self.X, self.y, num_features=3, alpha=0) self.assertTrue(not any(cqm.objective.linear.values())) + def test_alpha_1_no_quadratic(self): + cqm = SelectFromQuadraticModel.correlation_cqm(self.X, self.y, num_features=3, alpha=1) + self.assertTrue(not any(cqm.objective.quadratic.values())) + def test_alpha_1(self): rng = np.random.default_rng(42) From 25634b69e1d893c161a8661a632e0402cd5d80d0 Mon Sep 17 00:00:00 2001 From: Aivar Sootla Date: Tue, 2 Apr 2024 14:43:15 +0000 Subject: [PATCH 03/12] [WIP] adding feature selection using (conditional) mutual information --- dwave/plugins/sklearn/transformers.py | 154 ++++++++-- dwave/plugins/sklearn/utilities.py | 274 ++++++++++++++++++ .../notes/mutual-info-ddd1a23e320f88ae.yaml | 4 + tests/test_utilities.py | 104 ++++++- 4 files changed, 504 insertions(+), 32 deletions(-) create mode 100644 
releasenotes/notes/mutual-info-ddd1a23e320f88ae.yaml diff --git a/dwave/plugins/sklearn/transformers.py b/dwave/plugins/sklearn/transformers.py index ada1ad3..9773e99 100644 --- a/dwave/plugins/sklearn/transformers.py +++ b/dwave/plugins/sklearn/transformers.py @@ -30,7 +30,7 @@ from sklearn.feature_selection import SelectorMixin from sklearn.utils.validation import check_is_fitted -from dwave.plugins.sklearn.utilities import corrcoef +from dwave.plugins.sklearn.utilities import corrcoef, estimate_mi_matrix __all__ = ["SelectFromQuadraticModel"] @@ -57,7 +57,7 @@ class SelectFromQuadraticModel(SelectorMixin, BaseEstimator): ACCEPTED_METHODS = [ "correlation", - # "mutual information", # todo + "mutual_information", ] def __init__( @@ -153,27 +153,6 @@ def correlation_cqm( https://1qbit.com/whitepaper/optimal-feature-selection-in-credit-scoring-classification-using-quantum-annealer """ - X = np.atleast_2d(np.asarray(X)) - y = np.asarray(y) - - if X.ndim != 2: - raise ValueError("X must be a 2-dimensional array-like") - - if y.ndim != 1: - raise ValueError("y must be a 1-dimensional array-like") - - if y.shape[0] != X.shape[0]: - raise ValueError(f"requires: X.shape[0] == y.shape[0] but {X.shape[0]} != {y.shape[0]}") - - if not 0 <= alpha <= 1: - raise ValueError(f"alpha must be between 0 and 1, given {alpha}") - - if num_features <= 0: - raise ValueError(f"num_features must be a positive integer, given {num_features}") - - if X.shape[0] <= 1: - raise ValueError("X must have at least two rows") - cqm = dimod.ConstrainedQuadraticModel() cqm.add_variables(dimod.BINARY, X.shape[1]) @@ -212,11 +191,113 @@ def correlation_cqm( np.fill_diagonal(correlations, correlations[:, -1] * (- alpha * num_features)) # Note: we only add terms on and above the diagonal it = np.nditer(correlations[:-1, :-1], flags=['multi_index'], op_flags=[['readonly']]) - cqm.set_objective((*it.multi_index, x) for x in it - if it.multi_index[0] <= it.multi_index[1] and x) + print(correlations) + cqm.set_objective((*it.multi_index, x) for x in it if x) return cqm + @staticmethod + def mutual_information_cqm( + X: npt.ArrayLike, + y: npt.ArrayLike, + *, + num_features: int, + alpha: float = 0.5, + strict: bool = True, + conditional: bool = True, + discrete_features: typing.Union[str, npt.ArrayLike] = "auto", + discrete_target: bool = False, + copy: bool = True, + n_neighbors: int = 4, + n_workers: int = 1, + random_state: typing.Union[None, int] = None, + ) -> dimod.ConstrainedQuadraticModel: + """Build a constrained quadratic model for feature selection. + + If ``conditional`` is True then the conditional mutual information + criterion from [2] is used, and if ``conditional`` is False then + mutual information based criterion from [1] is used. + + For computation of mutual information and conditional mutual information + + + Args: + X: + Feature vectors formatted as a numerical 2D array-like. + y: + Class labels formatted as a numerical 1D array-like. + alpha: + Hyperparameter between 0 and 1 that controls the relative weight of + the relevance and redundancy terms. + ``alpha=0`` places no weight on the quality of the features, + therefore the features will be selected as to minimize the + redundancy without any consideration to quality. + ``alpha=1`` places the maximum weight on the quality of the features, + and therefore will be equivalent to using + :class:`sklearn.feature_selection.SelectKBest`. + num_features: + The number of features to select. 
+ strict: + If ``False`` the constraint on the number of selected features + is ``<=`` rather than ``==``. + conditional: bool, default=True + Whether to compute the off-diagonal terms using the conditional mutual + information or joint mutual information + discrete_features: + See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + discrete_target: + See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + n_neighbors: + See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + copy: + See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + random_state: + See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + n_workers: int, default=1 + Number of workers for parallel computation on the cpu + + Returns: + A constrained quadratic model. + + References: + .. [1] Peng, F. Long, and C. Ding. Feature selection based on mutual information criteria of max-dependency, + max-relevance, and min-redundancy. IEEE Transactions on pattern analysis and machine intelligence, + 27(8):1226–1238, 2005. + .. [2] X. V. Nguyen, J. Chan, S. Romano, and J. Bailey. Effective global approaches for mutual information + based feature selection. In Proceedings of the 20th ACM SIGKDD international conference on + Knowledge discovery and data mining, pages 512–521. ACM, 2014. + """ + + cqm = dimod.ConstrainedQuadraticModel() + cqm.add_variables(dimod.BINARY, X.shape[1]) + + # add the k-hot constraint + cqm.add_constraint( + ((v, 1) for v in cqm.variables), + '==' if strict else '<=', + num_features, + label=f"{num_features}-hot", + ) + + mi = estimate_mi_matrix( + X, y, discrete_features, discrete_target, + n_neighbors=n_neighbors, copy=copy, + random_state=random_state, n_workers=n_workers, + conditional=conditional) + + if not conditional: + # mutliplying all features with num_features + np.multiply(mi, num_features, out=mi) + # mutpliypling off-diagonal ones with -(1-alpha) + np.multiply(mi, -(1 - alpha), out=mi) + # mutpliypling off-diagonal ones with alpha + diagonal = alpha * np.diag(mi) + np.fill_diagonal(mi, diagonal) + + it = np.nditer(mi, flags=['multi_index'], op_flags=[['readonly']]) + cqm.set_objective((*it.multi_index, x) for x in it if x) + return cqm + def fit( self, X: npt.ArrayLike, @@ -225,6 +306,7 @@ def fit( alpha: typing.Optional[float] = None, num_features: typing.Optional[int] = None, time_limit: typing.Optional[float] = None, + **kwargs ) -> SelectFromQuadraticModel: """Select the features to keep. @@ -253,18 +335,30 @@ def fit( This instance of `SelectFromQuadraticModel`. 
""" X = np.atleast_2d(np.asarray(X)) + y = np.asarray(y) + if X.ndim != 2: raise ValueError("X must be a 2-dimensional array-like") + if y.ndim != 1: + raise ValueError("y must be a 1-dimensional array-like") - # y is checked by the correlation method function + if y.shape[0] != X.shape[0]: + raise ValueError(f"requires: X.shape[0] == y.shape[0] but {X.shape[0]} != {y.shape[0]}") + + if X.shape[0] <= 1: + raise ValueError("X must have at least two rows") if alpha is None: alpha = self.alpha - # alpha is checked by the correlation method function if num_features is None: num_features = self.num_features - # num_features is checked by the correlation method function + + if not 0 <= alpha <= 1: + raise ValueError(f"alpha must be between 0 and 1, given {alpha}") + + if num_features <= 0: + raise ValueError(f"num_features must be a positive integer, given {num_features}") # time_limit is checked by the LeapHybridCQMSampelr @@ -275,8 +369,8 @@ def fit( if self.method == "correlation": cqm = self.correlation_cqm(X, y, num_features=num_features, alpha=alpha) - # elif self.method == "mutual information": - # cqm = self.mutual_information_cqm(X, y, num_features=num_features) + elif self.method == "mutual_information": + cqm = self.mutual_information_cqm(X, y, num_features=num_features, alpha=alpha, **kwargs) else: raise ValueError(f"only methods {self.acceptable_methods} are implemented") diff --git a/dwave/plugins/sklearn/utilities.py b/dwave/plugins/sklearn/utilities.py index caccfcd..9670651 100644 --- a/dwave/plugins/sklearn/utilities.py +++ b/dwave/plugins/sklearn/utilities.py @@ -47,10 +47,20 @@ # See the License for the specific language governing permissions and # limitations under the License. +from itertools import combinations import typing +from joblib import Parallel, delayed, cpu_count import numpy as np import numpy.typing as npt +from scipy.sparse import issparse +from scipy.special import digamma +from scipy.stats import entropy +from sklearn.feature_selection._mutual_info import _compute_mi, _iterate_columns +from sklearn.neighbors import NearestNeighbors, KDTree +from sklearn.preprocessing import scale +from sklearn.utils import check_random_state +from sklearn.utils.validation import check_array, check_X_y __all__ = ["corrcoef", "cov", "dot_2d"] @@ -224,3 +234,267 @@ def dot_2d(a: npt.ArrayLike, b: npt.ArrayLike, *, out.flush() return out + + +def estimate_mi_matrix( + X: npt.ArrayLike, + y: npt.ArrayLike, + discrete_features: typing.Union[str, npt.ArrayLike]="auto", + discrete_target: bool = False, + n_neighbors: int = 4, + conditional: bool = True, + copy: bool = True, + random_state: typing.Union[None, int] = None, + n_workers: int = 1, + ) -> npt.ArrayLike: + """ + For the feature array `X` and the target array `y` computes + the matrix of (conditional) mutual information interactions. + + cmi_{i, j} = I(x_i; y) + + If `conditional = True`, then the off-diagonal terms are computed: + + cmi_{i, j} = (I(x_i; y| x_j) + I(x_j; y| x_i)) / 2 + + Otherwise + + cmi_{i, j} = I(x_i; x_j) + + Computation of I(x; y) uses the scikit-learn implementation, i.e., + :func:`sklearn.feature_selection._mutual_info._estimate_mi`. 
The computation + of I(x; y| z) is based on + https://github.com/jannisteunissen/mutual_information + + Args: + X: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + + y: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + + conditional: bool, default=True + Whether to compute the off-diagonal terms using the conditional mutual + information or joint mutual information + + discrete_features: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + + discrete_target: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + + n_neighbors: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + + copy: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + + random_state: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + + n_workers: int, default=1 + Number of workers for parallel computation on the cpu + + Returns: + mi_matrix : ndarray, shape (n_features, n_features) + Interaction matrix between the features using (conditional) mutual information. + A negative value will be replaced by 0. + + References: + .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual + information". Phys. Rev. E 69, 2004. + .. [2] B. C. Ross "Mutual Information between Discrete and Continuous + Data Sets". PLoS ONE 9(2), 2014. + .. [3] Mesner, Octavio César, and Cosma Rohilla Shalizi. "Conditional + mutual information estimation for mixed, discrete and continuous + data." IEEE Transactions on Information Theory 67.1 (2020): 464-484. + """ + + X, y = check_X_y(X, y, accept_sparse="csc", y_numeric=not discrete_target) + n_samples, n_features = X.shape + + if isinstance(discrete_features, (str, bool)): + if isinstance(discrete_features, str): + if discrete_features == "auto": + discrete_features = issparse(X) + else: + raise ValueError("Invalid string value for discrete_features.") + discrete_mask = np.empty(n_features, dtype=bool) + discrete_mask.fill(discrete_features) + else: + discrete_features = check_array(discrete_features, ensure_2d=False) + if discrete_features.dtype != "bool": + discrete_mask = np.zeros(n_features, dtype=bool) + discrete_mask[discrete_features] = True + else: + discrete_mask = discrete_features + + continuous_mask = ~discrete_mask + if np.any(continuous_mask) and issparse(X): + raise ValueError("Sparse matrix `X` can't have continuous features.") + + rng = check_random_state(random_state) + if np.any(continuous_mask): + if copy: + X = X.copy() + + X[:, continuous_mask] = scale( + X[:, continuous_mask], with_mean=False, copy=False + ) + + # Add small noise to continuous features as advised in Kraskov et. al. + X = X.astype(np.float64, copy=False) + means = np.maximum(1, np.mean(np.abs(X[:, continuous_mask]), axis=0)) + X[:, continuous_mask] += ( + 1e-10 + * means + * rng.standard_normal(size=(n_samples, np.sum(continuous_mask))) + ) + + if not discrete_target: + y = scale(y, with_mean=False) + y += ( + 1e-10 + * np.maximum(1, np.mean(np.abs(y))) + * rng.standard_normal(size=n_samples) + ) + + n_features = X.shape[1] + max_n_workers = cpu_count()-1 + if n_workers > max_n_workers: + n_workers = max_n_workers + Warning(f"Specified number of workers {n_workers} is larger than the number of cpus." 
+ f"Will use only {max_n_workers}.") + + mi_matrix = np.zeros((n_features, n_features), dtype=np.float64) + + off_diagonal_vals = Parallel(n_jobs=n_workers)( + delayed(_compute_off_diagonal_mi) + (xi, xj, y, discrete_feature_i, discrete_feature_j, discrete_target, n_neighbors, conditional) + for (xi, discrete_feature_i), (xj, discrete_feature_j) in combinations(zip(_iterate_columns(X), discrete_mask), 2) + ) + # keeping the discrete masks since the functions support it + diagonal_vals = Parallel(n_jobs=n_workers)( + delayed(_compute_mi) + (xi, y, discrete_feature_i, discrete_target, n_neighbors) + for xi, discrete_feature_i in zip(_iterate_columns(X), discrete_mask) + ) + np.fill_diagonal(mi_matrix, diagonal_vals) + off_diagonal_val = iter(off_diagonal_vals) + if conditional: + # Although we need to compute `(I(X_j; Y| X_i) + I(X_i; Y| X_j))/2` + # we can avoid the computation of the second term using the formula: + # `I(X_j; Y| X_i) = I(X_i; Y| X_j) + I(X_j; Y) - I(X_i; Y)` + for (i, d_i), (j, d_j) in combinations(enumerate(diagonal_vals), 2): + val = next(off_diagonal_val) + val += max(val + (d_j - d_i), 0) + mi_matrix[i, j] = mi_matrix[j, i] = val/2 + else: + for i, j in combinations(range(n_features), 2): + mi_matrix[i, j] = mi_matrix[j, i] = next(off_diagonal_val) + return mi_matrix + + +def _compute_off_diagonal_mi( + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike, + discrete_feature_i: bool, + discrete_feature_j: bool, + discrete_target: bool, + n_neighbors: int = 4, + conditional: bool = True, + ): + """ + Computing a distance `d` between features `x_i` and `x_j`. + If `conditional = True`, then conditional mutual infomation is used + `I(x_i; y | x_j)`. + + If `conditonal = False` then mutual information is used + `I(x_i; x_j)`. + + References + ---------- + .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual + information". Phys. Rev. E 69, 2004. + .. [2] B. C. Ross "Mutual Information between Discrete and Continuous + Data Sets". PLoS ONE 9(2), 2014. + .. [3] Mesner, Octavio César, and Cosma Rohilla Shalizi. "Conditional + mutual information estimation for mixed, discrete and continuous + data." IEEE Transactions on Information Theory 67.1 (2020): 464-484. + """ + if conditional: + if discrete_feature_i and discrete_feature_j and discrete_target: + return _compute_cmi_d(xi, xj, y) + else: + # TODO: consider adding treatement of mixed discrete + # and continuous variables + return _compute_cmi_c(xi, xj, y, n_neighbors) + else: + return _compute_mi(xi, xj, discrete_feature_i, discrete_feature_j, n_neighbors) + + +def _compute_cmi_d( + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike): + """ + Computes conditional mutual infomation `I(x_i; y | x_j)`. + All random variables `x_i`, `x_j` and `y` are discrete. + + Adpated from https://github.com/dwave-examples/mutual-information-feature-selection + """ + + # Computing joint probability distribution for the features and the target + unique_labels = [np.hstack((np.unique(col), np.inf)) for col in (xi, xj, y)] + prob, _ = np.histogramdd(np.vstack((xi, xj, y)).T, bins=unique_labels) + + cmi_ij = ( + entropy(prob.sum(axis=2).flatten()) # H(x_i, x_j) + +entropy(prob.sum(axis=0).flatten()) # H(x_j, y) + -entropy(prob.sum(axis=(0,2))) # H(x_j) + -entropy(prob.flatten()) # H(x_i, x_j, y) + ) + return cmi_ij + + +def _compute_cmi_c( + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike, + n_neighbors: int = 4): + """ + Computes conditional mutual infomation `(I(x_i; y | x_j)`. 
+ At least one random variables from `x_i`, `x_j` and `y` is continuous. + + Adapted from https://github.com/jannisteunissen/mutual_information + """ + xi = xi.reshape((-1, 1)) + xj = xj.reshape((-1, 1)) + y = y.reshape((-1, 1)) + + # Here we rely on NearestNeighbors to select the fastest algorithm. + nn = NearestNeighbors(metric="chebyshev", n_neighbors=n_neighbors) + nn.fit(np.hstack((xi, xj, y))) + radius = nn.kneighbors()[0] + radius = np.nextafter(radius[:, -1], 0) + + # KDTree is explicitly fit to allow for the querying of number of + # neighbors within a specified radius + n_j = _num_points_within_radius(xj, radius) + n_ij = _num_points_within_radius(np.hstack((xi, xj)), radius) + n_jy = _num_points_within_radius(np.hstack((y, xj)), radius) + + cmi_ij = ( + digamma(n_neighbors) + +np.mean(digamma(n_j)) + -np.mean(digamma(n_ij)) + -np.mean(digamma(n_jy))) + return max(0, cmi_ij) + + +def _num_points_within_radius(x: npt.ArrayLike, radius: npt.ArrayLike): + """For each point, determine the number of other points within a given radius + Function from https://github.com/jannisteunissen/mutual_information + + :param x: ndarray, shape (n_samples, n_dim) + :param radius: radius, shape (n_samples,) + :returns: number of points within radius + + """ + kd = KDTree(x, metric="chebyshev") + nx = kd.query_radius(x, radius, count_only=True, return_distance=False) + return np.array(nx) diff --git a/releasenotes/notes/mutual-info-ddd1a23e320f88ae.yaml b/releasenotes/notes/mutual-info-ddd1a23e320f88ae.yaml new file mode 100644 index 0000000..c3ed08c --- /dev/null +++ b/releasenotes/notes/mutual-info-ddd1a23e320f88ae.yaml @@ -0,0 +1,4 @@ +--- +features: + - | + Add feature selection method using (conditional) mutual information diff --git a/tests/test_utilities.py b/tests/test_utilities.py index 107f4d8..df2a35e 100644 --- a/tests/test_utilities.py +++ b/tests/test_utilities.py @@ -51,8 +51,10 @@ import numpy as np -from dwave.plugins.sklearn.utilities import corrcoef, cov, dot_2d - +from dwave.plugins.sklearn.utilities import corrcoef, cov, dot_2d, _compute_cmi_c, _compute_cmi_d +from scipy.special import digamma +from sklearn.feature_selection._mutual_info import _compute_mi +from sklearn.neighbors import KDTree class TestCorrCoef(unittest.TestCase): def test_agreement(self): @@ -178,3 +180,101 @@ def test_memmap(self): out = np.memmap(fout, "float64", mode="w+", shape=(X.shape[0], X.shape[0])) dot_2d(X, X.T, out=out) + + +class TestMI(unittest.TestCase): + def test_cmi(self): + """ + We test the algorithm using the formula `I(x;y|z) = I(x;y) + I(z;y) - I(z;y|x)`. + Since the mutual information data is an sklearn function, + :func:`sklearn.feature_selection._mutual_info._compute_mi` + it is highly likely that formula is valid only if our algorithm is correct. 
+ For continous variables the formula may still break for some seeds + """ + np.random.seed(42) + n_samples, n_neighbors = 4003, 4 + # Test continuous implementation + Xy = np.random.randn(n_samples, 3) + cmi_ij = _compute_cmi_c(Xy[:, 0], Xy[:, 1], Xy[:, 2], n_neighbors=n_neighbors) + cmi_ji = _compute_cmi_c(Xy[:, 1], Xy[:, 0], Xy[:, 2], n_neighbors=n_neighbors) + mi_i = _compute_mi(Xy[:, 0], Xy[:, 2], False, False, n_neighbors=n_neighbors) + mi_j = _compute_mi(Xy[:, 1], Xy[:, 2], False, False, n_neighbors=n_neighbors) + self.assertAlmostEqual( + max(cmi_ij + mi_j - mi_i, 0), cmi_ji, places=3, + msg="The formula for continuous conditional mutual information is violated") + + # Test discrete implementation + n_samples, n_ss = 103, 51 + xi = np.random.randint(0, 11, (n_samples, )) + xj = np.hstack(( + np.random.randint(-2, 1, (n_ss, )), + np.random.randint(10, 14, (n_samples-n_ss, )))) + np.random.shuffle(xi) + np.random.shuffle(xj) + y = np.random.randint(-12, -10, (n_samples, )) + cmi_ij = _compute_cmi_d(xi, xj, y) + cmi_ji = _compute_cmi_d(xj, xi, y) + mi_i = _compute_mi(xi, y, True, True, n_neighbors=n_neighbors) + mi_j = _compute_mi(xj, y, True, True, n_neighbors=n_neighbors) + self.assertAlmostEqual( + cmi_ij + mi_j - mi_i, cmi_ji, places=5, + msg="The formula for discrete conditional mutual information is violated") + + def test_implementation(self): + # methods from https://github.com/jannisteunissen/mutual_information + def _compute_cmi_t(x, z, y, n_neighbors): + n_samples = len(x) + + x = x.reshape(-1, 1) + y = y.reshape(-1, 1) + z = z.reshape(-1, 1) + xyz = np.hstack((x, y, z)) + k = np.full(n_samples, n_neighbors) + radius = get_radius_kneighbors(xyz, n_neighbors) + + nxz = num_points_within_radius(np.hstack((x, z)), radius) + nyz = num_points_within_radius(np.hstack((y, z)), radius) + nz = num_points_within_radius(z, radius) + + cmi = max(0, np.mean(digamma(k)) - np.mean(digamma(nxz + 1)) + - np.mean(digamma(nyz + 1)) + np.mean(digamma(nz + 1))) + return cmi + + def get_radius_kneighbors(x, n_neighbors): + """Determine smallest radius around x containing n_neighbors neighbors + + :param x: ndarray, shape (n_samples, n_dim) + :param n_neighbors: number of neighbors + :returns: radius, shape (n_samples,) + """ + # Use KDTree for simplicity (sometimes a ball tree could be faster) + kd = KDTree(x, metric="chebyshev") + + # Results include point itself, therefore n_neighbors+1 + neigh_dist = kd.query(x, k=n_neighbors+1)[0] + + # Take radius slightly larger than distance to last neighbor + radius = np.nextafter(neigh_dist[:, -1], 0) + return radius + + def num_points_within_radius(x, radius): + """For each point, determine the number of other points within a given radius + + :param x: ndarray, shape (n_samples, n_dim) + :param radius: radius, shape (n_samples,) + :returns: number of points within radius + """ + kd = KDTree(x, metric="chebyshev") + nx = kd.query_radius(x, radius, count_only=True, return_distance=False) + return np.array(nx) - 1.0 + + n_samples, n_neighbors = 103, 4 + # Test continuous implementation + Xy = np.random.randn(n_samples, 3) + cmi_t = _compute_cmi_t(Xy[:,0], Xy[:,1], Xy[:,2], n_neighbors=n_neighbors) + + cmi_c = _compute_cmi_c(Xy[:,0], Xy[:,1], Xy[:,2], n_neighbors=n_neighbors) + + self.assertAlmostEqual( + cmi_t, cmi_c, places=5, + msg="The algorithm doesn't match the original implementation") From 31ae832155019ef64c28a23914017f9b943332ae Mon Sep 17 00:00:00 2001 From: Aivar Sootla Date: Mon, 8 Apr 2024 15:23:43 +0100 Subject: [PATCH 04/12] add computations 
for mixed distributions, respond to PR comments --- dwave/plugins/sklearn/transformers.py | 218 ++++++++++++-- dwave/plugins/sklearn/utilities.py | 409 +++++++++++++++----------- requirements.txt | 3 + setup.cfg | 5 +- tests/test_transformer.py | 8 +- tests/test_utilities.py | 94 ++++-- 6 files changed, 505 insertions(+), 232 deletions(-) diff --git a/dwave/plugins/sklearn/transformers.py b/dwave/plugins/sklearn/transformers.py index 9773e99..88dc149 100644 --- a/dwave/plugins/sklearn/transformers.py +++ b/dwave/plugins/sklearn/transformers.py @@ -14,23 +14,25 @@ from __future__ import annotations -import itertools -import logging +from itertools import combinations import tempfile import typing -import warnings import dimod -import numpy as np -import numpy.typing as npt - +from dwave.plugins.sklearn.utilities import corrcoef, _compute_off_diagonal_mi, tqdm_joblib from dwave.cloud.exceptions import ConfigFileError, SolverAuthenticationError from dwave.system import LeapHybridCQMSampler +from joblib import Parallel, delayed, cpu_count +import numpy as np +import numpy.typing as npt +from scipy.sparse import issparse from sklearn.base import BaseEstimator from sklearn.feature_selection import SelectorMixin -from sklearn.utils.validation import check_is_fitted - -from dwave.plugins.sklearn.utilities import corrcoef, estimate_mi_matrix +from sklearn.feature_selection._mutual_info import _compute_mi, _iterate_columns +from sklearn.preprocessing import scale +from sklearn.utils import check_random_state +from sklearn.utils.validation import check_is_fitted, check_array, check_X_y +from tqdm import tqdm __all__ = ["SelectFromQuadraticModel"] @@ -110,8 +112,8 @@ def _get_support_mask(self) -> np.ndarray[typing.Any, np.dtype[np.bool_]]: except AttributeError: raise RuntimeError("fit hasn't been run yet") - @staticmethod def correlation_cqm( + self, X: npt.ArrayLike, y: npt.ArrayLike, *, @@ -152,6 +154,7 @@ def correlation_cqm( 1QBit; White Paper. https://1qbit.com/whitepaper/optimal-feature-selection-in-credit-scoring-classification-using-quantum-annealer """ + self._check_params(X, y, alpha, num_features) cqm = dimod.ConstrainedQuadraticModel() cqm.add_variables(dimod.BINARY, X.shape[1]) @@ -191,13 +194,12 @@ def correlation_cqm( np.fill_diagonal(correlations, correlations[:, -1] * (- alpha * num_features)) # Note: we only add terms on and above the diagonal it = np.nditer(correlations[:-1, :-1], flags=['multi_index'], op_flags=[['readonly']]) - print(correlations) cqm.set_objective((*it.multi_index, x) for x in it if x) return cqm - @staticmethod - def mutual_information_cqm( + def mutual_information_cqm( + self, X: npt.ArrayLike, y: npt.ArrayLike, *, @@ -268,6 +270,8 @@ def mutual_information_cqm( Knowledge discovery and data mining, pages 512–521. ACM, 2014. """ + self._check_params(X, y, alpha, num_features) + cqm = dimod.ConstrainedQuadraticModel() cqm.add_variables(dimod.BINARY, X.shape[1]) @@ -335,30 +339,18 @@ def fit( This instance of `SelectFromQuadraticModel`. 
""" X = np.atleast_2d(np.asarray(X)) - y = np.asarray(y) - if X.ndim != 2: raise ValueError("X must be a 2-dimensional array-like") - if y.ndim != 1: - raise ValueError("y must be a 1-dimensional array-like") - if y.shape[0] != X.shape[0]: - raise ValueError(f"requires: X.shape[0] == y.shape[0] but {X.shape[0]} != {y.shape[0]}") - - if X.shape[0] <= 1: - raise ValueError("X must have at least two rows") + # y is checked by the correlation method function if alpha is None: alpha = self.alpha + # alpha is checked by the correlation method function if num_features is None: num_features = self.num_features - - if not 0 <= alpha <= 1: - raise ValueError(f"alpha must be between 0 and 1, given {alpha}") - - if num_features <= 0: - raise ValueError(f"num_features must be a positive integer, given {num_features}") + # num_features is checked by the correlation method function # time_limit is checked by the LeapHybridCQMSampelr @@ -403,3 +395,173 @@ def fit( def unfit(self): """Undo a previously executed ``fit`` method.""" del self._mask + + @staticmethod + def _check_params(X: npt.ArrayLike, + y: npt.ArrayLike, + alpha: float, + num_features: int): + X = np.atleast_2d(np.asarray(X)) + y = np.asarray(y) + + if X.ndim != 2: + raise ValueError("X must be a 2-dimensional array-like") + + if y.ndim != 1: + raise ValueError("y must be a 1-dimensional array-like") + + if y.shape[0] != X.shape[0]: + raise ValueError(f"requires: X.shape[0] == y.shape[0] but {X.shape[0]} != {y.shape[0]}") + + if not 0 <= alpha <= 1: + raise ValueError(f"alpha must be between 0 and 1, given {alpha}") + + if num_features <= 0: + raise ValueError(f"num_features must be a positive integer, given {num_features}") + + if X.shape[0] <= 1: + raise ValueError("X must have at least two rows") + + +def estimate_mi_matrix( + X: npt.ArrayLike, + y: npt.ArrayLike, + discrete_features: typing.Union[str, npt.ArrayLike]="auto", + discrete_target: bool = False, + n_neighbors: int = 4, + conditional: bool = True, + copy: bool = True, + random_state: typing.Union[None, int] = None, + n_workers: int = 1, + n_subsample: int = -1 + ) -> npt.ArrayLike: + """ + For the feature array `X` and the target array `y` computes + the matrix of (conditional) mutual information interactions. + + cmi_{i, j} = I(x_i; y) + + If `conditional = True`, then the off-diagonal terms are computed: + + cmi_{i, j} = (I(x_i; y| x_j) + I(x_j; y| x_i)) / 2 + + Otherwise + + cmi_{i, j} = I(x_i; x_j) + + Computation of I(x; y) uses the scikit-learn implementation, i.e., + :func:`sklearn.feature_selection._mutual_info._estimate_mi`. 
The computation + of I(x; y| z) is based on + https://github.com/jannisteunissen/mutual_information + + Args: + X: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + + y: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + + conditional: bool, default=True + Whether to compute the off-diagonal terms using the conditional mutual + information or joint mutual information + + discrete_features: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + + discrete_target: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + + n_neighbors: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + + copy: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + + random_state: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + + n_workers: int, default=1 + Number of workers for parallel computation on the cpu + + Returns: + mi_matrix : ndarray, shape (n_features, n_features) + Interaction matrix between the features using (conditional) mutual information. + A negative value will be replaced by 0. + + References: + .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual + information". Phys. Rev. E 69, 2004. + .. [2] B. C. Ross "Mutual Information between Discrete and Continuous + Data Sets". PLoS ONE 9(2), 2014. + .. [3] Mesner, Octavio César, and Cosma Rohilla Shalizi. "Conditional + mutual information estimation for mixed, discrete and continuous + data." IEEE Transactions on Information Theory 67.1 (2020): 464-484. + """ + + X, y = check_X_y(X, y, accept_sparse="csc", y_numeric=not discrete_target) + n_samples, n_features = X.shape + + if isinstance(discrete_features, (str, bool)): + if isinstance(discrete_features, str): + if discrete_features == "auto": + discrete_features = issparse(X) + else: + raise ValueError("Invalid string value for discrete_features.") + discrete_mask = np.empty(n_features, dtype=bool) + discrete_mask.fill(discrete_features) + else: + discrete_features = check_array(discrete_features, ensure_2d=False) + if discrete_features.dtype != "bool": + discrete_mask = np.zeros(n_features, dtype=bool) + discrete_mask[discrete_features] = True + else: + discrete_mask = discrete_features + + continuous_mask = ~discrete_mask + if np.any(continuous_mask) and issparse(X): + raise ValueError("Sparse matrix `X` can't have continuous features.") + + rng = check_random_state(random_state) + if np.any(continuous_mask): + if copy: + X = X.copy() + + X[:, continuous_mask] = scale( + X[:, continuous_mask], with_mean=False, copy=False + ) + + # Add small noise to continuous features as advised in Kraskov et. al. + X = X.astype(np.float64, copy=False) + means = np.maximum(1, np.mean(np.abs(X[:, continuous_mask]), axis=0)) + X[:, continuous_mask] += ( + 1e-10 + * means + * rng.standard_normal(size=(n_samples, np.sum(continuous_mask))) + ) + + if not discrete_target: + y = scale(y, with_mean=False) + y += ( + 1e-10 + * np.maximum(1, np.mean(np.abs(y))) + * rng.standard_normal(size=n_samples) + ) + + n_features = X.shape[1] + max_n_workers = cpu_count()-1 + if n_workers > max_n_workers: + n_workers = max_n_workers + Warning(f"Specified number of workers {n_workers} is larger than the number of cpus." 
+ f"Will use only {max_n_workers}.") + + mi_matrix = np.zeros((n_features, n_features), dtype=np.float64) + with tqdm_joblib(tqdm(desc="Computing off-diagonal terms", total=len(discrete_mask) * (len(discrete_mask) - 1) // 2 )) as progress_bar: + off_diagonal_vals = Parallel(n_jobs=n_workers)( + delayed(_compute_off_diagonal_mi) + (xi, xj, y, discrete_feature_i, discrete_feature_j, discrete_target, n_neighbors, conditional, n_subsample) + for (xi, discrete_feature_i), (xj, discrete_feature_j) in combinations(zip(_iterate_columns(X), discrete_mask), 2) + ) + diagonal_vals = Parallel(n_jobs=n_workers)( + delayed(_compute_mi) + (xi, y, discrete_feature_i, discrete_target, n_neighbors) + for xi, discrete_feature_i in zip(_iterate_columns(X), discrete_mask) + ) + np.fill_diagonal(mi_matrix, diagonal_vals) + off_diagonal_val = iter(off_diagonal_vals) + for i, j in combinations(range(n_features), 2): + mi_matrix[i, j] = mi_matrix[j, i] = next(off_diagonal_val) + return mi_matrix + diff --git a/dwave/plugins/sklearn/utilities.py b/dwave/plugins/sklearn/utilities.py index 9670651..5910b53 100644 --- a/dwave/plugins/sklearn/utilities.py +++ b/dwave/plugins/sklearn/utilities.py @@ -47,20 +47,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -from itertools import combinations import typing -from joblib import Parallel, delayed, cpu_count +import contextlib +import joblib import numpy as np import numpy.typing as npt -from scipy.sparse import issparse from scipy.special import digamma from scipy.stats import entropy -from sklearn.feature_selection._mutual_info import _compute_mi, _iterate_columns +from sklearn.feature_selection._mutual_info import _compute_mi from sklearn.neighbors import NearestNeighbors, KDTree -from sklearn.preprocessing import scale -from sklearn.utils import check_random_state -from sklearn.utils.validation import check_array, check_X_y __all__ = ["corrcoef", "cov", "dot_2d"] @@ -235,158 +231,6 @@ def dot_2d(a: npt.ArrayLike, b: npt.ArrayLike, *, return out - -def estimate_mi_matrix( - X: npt.ArrayLike, - y: npt.ArrayLike, - discrete_features: typing.Union[str, npt.ArrayLike]="auto", - discrete_target: bool = False, - n_neighbors: int = 4, - conditional: bool = True, - copy: bool = True, - random_state: typing.Union[None, int] = None, - n_workers: int = 1, - ) -> npt.ArrayLike: - """ - For the feature array `X` and the target array `y` computes - the matrix of (conditional) mutual information interactions. - - cmi_{i, j} = I(x_i; y) - - If `conditional = True`, then the off-diagonal terms are computed: - - cmi_{i, j} = (I(x_i; y| x_j) + I(x_j; y| x_i)) / 2 - - Otherwise - - cmi_{i, j} = I(x_i; x_j) - - Computation of I(x; y) uses the scikit-learn implementation, i.e., - :func:`sklearn.feature_selection._mutual_info._estimate_mi`. 
The computation - of I(x; y| z) is based on - https://github.com/jannisteunissen/mutual_information - - Args: - X: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` - - y: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` - - conditional: bool, default=True - Whether to compute the off-diagonal terms using the conditional mutual - information or joint mutual information - - discrete_features: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` - - discrete_target: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` - - n_neighbors: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` - - copy: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` - - random_state: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` - - n_workers: int, default=1 - Number of workers for parallel computation on the cpu - - Returns: - mi_matrix : ndarray, shape (n_features, n_features) - Interaction matrix between the features using (conditional) mutual information. - A negative value will be replaced by 0. - - References: - .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual - information". Phys. Rev. E 69, 2004. - .. [2] B. C. Ross "Mutual Information between Discrete and Continuous - Data Sets". PLoS ONE 9(2), 2014. - .. [3] Mesner, Octavio César, and Cosma Rohilla Shalizi. "Conditional - mutual information estimation for mixed, discrete and continuous - data." IEEE Transactions on Information Theory 67.1 (2020): 464-484. - """ - - X, y = check_X_y(X, y, accept_sparse="csc", y_numeric=not discrete_target) - n_samples, n_features = X.shape - - if isinstance(discrete_features, (str, bool)): - if isinstance(discrete_features, str): - if discrete_features == "auto": - discrete_features = issparse(X) - else: - raise ValueError("Invalid string value for discrete_features.") - discrete_mask = np.empty(n_features, dtype=bool) - discrete_mask.fill(discrete_features) - else: - discrete_features = check_array(discrete_features, ensure_2d=False) - if discrete_features.dtype != "bool": - discrete_mask = np.zeros(n_features, dtype=bool) - discrete_mask[discrete_features] = True - else: - discrete_mask = discrete_features - - continuous_mask = ~discrete_mask - if np.any(continuous_mask) and issparse(X): - raise ValueError("Sparse matrix `X` can't have continuous features.") - - rng = check_random_state(random_state) - if np.any(continuous_mask): - if copy: - X = X.copy() - - X[:, continuous_mask] = scale( - X[:, continuous_mask], with_mean=False, copy=False - ) - - # Add small noise to continuous features as advised in Kraskov et. al. - X = X.astype(np.float64, copy=False) - means = np.maximum(1, np.mean(np.abs(X[:, continuous_mask]), axis=0)) - X[:, continuous_mask] += ( - 1e-10 - * means - * rng.standard_normal(size=(n_samples, np.sum(continuous_mask))) - ) - - if not discrete_target: - y = scale(y, with_mean=False) - y += ( - 1e-10 - * np.maximum(1, np.mean(np.abs(y))) - * rng.standard_normal(size=n_samples) - ) - - n_features = X.shape[1] - max_n_workers = cpu_count()-1 - if n_workers > max_n_workers: - n_workers = max_n_workers - Warning(f"Specified number of workers {n_workers} is larger than the number of cpus." 
- f"Will use only {max_n_workers}.") - - mi_matrix = np.zeros((n_features, n_features), dtype=np.float64) - - off_diagonal_vals = Parallel(n_jobs=n_workers)( - delayed(_compute_off_diagonal_mi) - (xi, xj, y, discrete_feature_i, discrete_feature_j, discrete_target, n_neighbors, conditional) - for (xi, discrete_feature_i), (xj, discrete_feature_j) in combinations(zip(_iterate_columns(X), discrete_mask), 2) - ) - # keeping the discrete masks since the functions support it - diagonal_vals = Parallel(n_jobs=n_workers)( - delayed(_compute_mi) - (xi, y, discrete_feature_i, discrete_target, n_neighbors) - for xi, discrete_feature_i in zip(_iterate_columns(X), discrete_mask) - ) - np.fill_diagonal(mi_matrix, diagonal_vals) - off_diagonal_val = iter(off_diagonal_vals) - if conditional: - # Although we need to compute `(I(X_j; Y| X_i) + I(X_i; Y| X_j))/2` - # we can avoid the computation of the second term using the formula: - # `I(X_j; Y| X_i) = I(X_i; Y| X_j) + I(X_j; Y) - I(X_i; Y)` - for (i, d_i), (j, d_j) in combinations(enumerate(diagonal_vals), 2): - val = next(off_diagonal_val) - val += max(val + (d_j - d_i), 0) - mi_matrix[i, j] = mi_matrix[j, i] = val/2 - else: - for i, j in combinations(range(n_features), 2): - mi_matrix[i, j] = mi_matrix[j, i] = next(off_diagonal_val) - return mi_matrix - def _compute_off_diagonal_mi( xi: npt.ArrayLike, @@ -397,6 +241,7 @@ def _compute_off_diagonal_mi( discrete_target: bool, n_neighbors: int = 4, conditional: bool = True, + n_subsample: int = -1 ): """ Computing a distance `d` between features `x_i` and `x_j`. @@ -417,48 +262,66 @@ def _compute_off_diagonal_mi( data." IEEE Transactions on Information Theory 67.1 (2020): 464-484. """ if conditional: + if n_subsample > 0: + idx = np.arange(xi.shape[0]) + np.random.shuffle(idx) + idx = idx[:n_subsample] + xi, xj, y = xi[idx], xj[idx], y[idx] if discrete_feature_i and discrete_feature_j and discrete_target: - return _compute_cmi_d(xi, xj, y) + val = _compute_cmip_d(xi, xj, y) + elif n_subsample >= 0 and discrete_target: + val = _compute_cmip_ccd(xi, xj, y, n_neighbors) + elif n_subsample >= 0 and discrete_feature_i and (not discrete_target): + val = _compute_cmip_cdc(xj, xi, y, n_neighbors) + elif n_subsample >= 0 and (not discrete_feature_i) and discrete_feature_j and (not discrete_target): + val = _compute_cmip_cdc(xi, xj, y, n_neighbors) else: - # TODO: consider adding treatement of mixed discrete - # and continuous variables - return _compute_cmi_c(xi, xj, y, n_neighbors) + val = _compute_cmip_c(xi, xj, y, n_neighbors) + return sum(val) / 2 else: return _compute_mi(xi, xj, discrete_feature_i, discrete_feature_j, n_neighbors) -def _compute_cmi_d( +def _compute_cmip_d( xi: npt.ArrayLike, xj: npt.ArrayLike, - y: npt.ArrayLike): + y: npt.ArrayLike) -> typing.Tuple[float]: """ - Computes conditional mutual infomation `I(x_i; y | x_j)`. + Computes conditional mutual infomation pair `I(x_i; y | x_j)` `I(x_j; y | x_i)`. All random variables `x_i`, `x_j` and `y` are discrete. 
Adpated from https://github.com/dwave-examples/mutual-information-feature-selection """ # Computing joint probability distribution for the features and the target - unique_labels = [np.hstack((np.unique(col), np.inf)) for col in (xi, xj, y)] - prob, _ = np.histogramdd(np.vstack((xi, xj, y)).T, bins=unique_labels) - + bin_boundaries = [np.hstack((np.unique(col), np.inf)) for col in (xi, xj, y)] + prob, _ = np.histogramdd(np.vstack((xi, xj, y)).T, bins=bin_boundaries) + Hijy = entropy(prob.flatten()) + Hij = entropy(prob.sum(axis=2).flatten()) cmi_ij = ( - entropy(prob.sum(axis=2).flatten()) # H(x_i, x_j) + Hij # H(x_i, x_j) +entropy(prob.sum(axis=0).flatten()) # H(x_j, y) -entropy(prob.sum(axis=(0,2))) # H(x_j) - -entropy(prob.flatten()) # H(x_i, x_j, y) + -Hijy # H(x_i, x_j, y) + ) + cmi_ji = ( + Hij # H(x_i, x_j) + +entropy(prob.sum(axis=1).flatten()) # H(x_i, y) + -entropy(prob.sum(axis=(1,2))) # H(x_i) + -Hijy # H(x_i, x_j, y) ) - return cmi_ij + return cmi_ij, cmi_ji - -def _compute_cmi_c( + +def _compute_cmip_c( xi: npt.ArrayLike, xj: npt.ArrayLike, y: npt.ArrayLike, - n_neighbors: int = 4): + n_neighbors: int = 4) -> typing.Tuple[float]: """ - Computes conditional mutual infomation `(I(x_i; y | x_j)`. - At least one random variables from `x_i`, `x_j` and `y` is continuous. + Computes conditional mutual information pair `(I(x_i; y | x_j)` + and `(I(x_j; y | x_i)`. All random variables `x_i`, `x_j` and `y` + are assumed to be continuous. Adapted from https://github.com/jannisteunissen/mutual_information """ @@ -478,12 +341,163 @@ def _compute_cmi_c( n_ij = _num_points_within_radius(np.hstack((xi, xj)), radius) n_jy = _num_points_within_radius(np.hstack((y, xj)), radius) + n_i = _num_points_within_radius(xi, radius) + n_iy = _num_points_within_radius(np.hstack((y, xi)), radius) + cmi_ij = ( digamma(n_neighbors) +np.mean(digamma(n_j)) -np.mean(digamma(n_ij)) -np.mean(digamma(n_jy))) - return max(0, cmi_ij) + + cmi_ji = ( + digamma(n_neighbors) + +np.mean(digamma(n_i)) + -np.mean(digamma(n_ij)) + -np.mean(digamma(n_iy))) + return max(0, cmi_ij), max(0, cmi_ji) + + +def _compute_cmip_ccd( + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike, + n_neighbors: int = 4) -> typing.Tuple[float]: + """ + Computes conditional mutual information pair `(I(x_i; y | x_j)` + and `(I(x_j; y | x_i)`. Random variables `x_i`, `x_j` are assumed + to be continuous, while `y` is assumed to be discrete. + + Adapted from https://github.com/jannisteunissen/mutual_information + """ + xi = xi.reshape((-1, 1)) + xj = xj.reshape((-1, 1)) + + (radius, + label_counts, + k_all) = _get_radius_k_neighbours_d( + xi, xj, y, + n_neighbors=n_neighbors) + + # Ignore points with unique labels. 
+ mask = label_counts > 1 + label_counts = label_counts[mask] + k_all = k_all[mask] + xi = xi[mask] + xj = xj[mask] + y = y[mask] + radius = radius[mask] + # KDTree is explicitly fit to allow for the querying of number of + # neighbors within a specified radius + + # continuous + n_i = _num_points_within_radius(xi, radius) + n_j = _num_points_within_radius(xj, radius) + n_ij = _num_points_within_radius(np.hstack((xi, xj)), radius) + + # mixed estimates + n_iy = _num_points_within_radius_cd(xi, y, radius) + n_jy = _num_points_within_radius_cd(xj, y, radius) + + common_terms = ( + np.mean(digamma(k_all)) + -np.mean(digamma(n_ij))) + cmi_ij = ( + common_terms + +np.mean(digamma(n_j)) + -np.mean(digamma(n_jy))) + cmi_ji = ( + common_terms + +np.mean(digamma(n_i)) + -np.mean(digamma(n_iy))) + return max(0, cmi_ij), max(0, cmi_ji) + + +def _compute_cmip_cdc( + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike, + n_neighbors: int = 4) -> typing.Tuple[float]: + """ + Computes conditional mutual information pair `(I(x_i; y | x_j)` + and `(I(x_j; y | x_i)`. Random variables `x_i`, `y` are assumed + to be continuous, while `x_j` is assumed to be discrete. + + Adapted from https://github.com/jannisteunissen/mutual_information + """ + xi = xi.reshape((-1, 1)) + y = y.reshape((-1, 1)) + + (radius, + label_counts, + k_all) = _get_radius_k_neighbours_d( + xi, y, xj, + n_neighbors=n_neighbors) + + # Ignore points with unique labels. + mask = label_counts > 1 + label_counts = label_counts[mask] + k_all = k_all[mask] + xi = xi[mask] + xj = xj[mask] + y = y[mask] + radius = radius[mask] + + # KDTree is explicitly fit to allow for the querying of number of + # neighbors within a specified radius + + # continuous + n_i = _num_points_within_radius(xi, radius) + n_iy = _num_points_within_radius(np.hstack((xi, y)), radius) + + # mixed estimates + n_ij = _num_points_within_radius_cd(xi, xj, radius) + n_jy = _num_points_within_radius_cd(y, xj, radius) + + # discrete estimates + n_j = label_counts + + common_terms = ( + np.mean(digamma(k_all)) + -np.mean(digamma(n_ij))) + cmi_ij = ( + common_terms + +np.mean(digamma(n_j)) + -np.mean(digamma(n_jy))) + cmi_ji = ( + common_terms + +np.mean(digamma(n_i)) + -np.mean(digamma(n_iy))) + return max(0, cmi_ij), max(0, cmi_ji) + + +def _get_radius_k_neighbours_d( + c1: npt.ArrayLike, + c2: npt.ArrayLike, + d: npt.ArrayLike, + n_neighbors: int = 4 + ) -> typing.Tuple[npt.ArrayLike, npt.ArrayLike, npt.ArrayLike]: + + c1 = c1.reshape((-1, 1)) + c2 = c2.reshape((-1, 1)) + n_samples = c1.shape[0] + radius = np.empty(n_samples) + label_counts = np.empty(n_samples) + k_all = np.empty(n_samples) + nn = NearestNeighbors(metric="chebyshev") + + for label in np.unique(d): + mask = d == label + count = np.sum(mask) + if count > 1: + k = min(n_neighbors, count - 1) + nn.set_params(n_neighbors=k) + nn.fit(np.hstack((c1[mask], c2[mask]))) + r = nn.kneighbors()[0] + radius[mask] = np.nextafter(r[:, -1], 0) + k_all[mask] = k + label_counts[mask] = np.sum(mask) + return radius, label_counts, k_all def _num_points_within_radius(x: npt.ArrayLike, radius: npt.ArrayLike): @@ -498,3 +512,42 @@ def _num_points_within_radius(x: npt.ArrayLike, radius: npt.ArrayLike): kd = KDTree(x, metric="chebyshev") nx = kd.query_radius(x, radius, count_only=True, return_distance=False) return np.array(nx) + + +def _num_points_within_radius_cd( + c: npt.ArrayLike, + d: npt.ArrayLike, + radius: npt.ArrayLike) -> npt.ArrayLike: + """ + For each point, determine the number of other points within a given radius + 
Inspired by _compute_mi_cd + """ + c = c.reshape((-1, 1)) + n_samples = c.shape[0] + m_all = np.empty(n_samples) + for label in np.unique(d): + mask = d == label + kd = KDTree(c[mask], metric="chebyshev") + m_all[mask] = kd.query_radius( + c[mask], radius[mask], + count_only=True, return_distance=False) + return m_all + + +@contextlib.contextmanager +def tqdm_joblib(tqdm_object): + """Context manager to patch joblib to report into tqdm progress bar given as argument + from https://stackoverflow.com/a/58936697/5133167 + """ + class TqdmBatchCompletionCallback(joblib.parallel.BatchCompletionCallBack): + def __call__(self, *args, **kwargs): + tqdm_object.update(n=self.batch_size) + return super().__call__(*args, **kwargs) + + old_batch_callback = joblib.parallel.BatchCompletionCallBack + joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback + try: + yield tqdm_object + finally: + joblib.parallel.BatchCompletionCallBack = old_batch_callback + tqdm_object.close() diff --git a/requirements.txt b/requirements.txt index 00814cb..43cc4f6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,8 @@ oldest-supported-numpy; python_version>="3.11.0" scikit-learn==1.2.0; python_version >= '3.8' scikit-learn==1.0.2; python_version < '3.8' +joblib>=0.11 +scipy>=1.7.3 +tqdm>=4.65.2 reno==3.5.0 diff --git a/setup.cfg b/setup.cfg index 269ee72..e84055b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -23,7 +23,10 @@ install_requires = dimod>=0.12.3 dwave-system>=1.18.0 numpy>=1.20.0 - scikit-learn>=1.0.2 + scikit-learn>=1.0.2 + scipy>=1.7.3 + joblib>=0.11 + tqdm>=4.65.0 packages = dwave dwave.plugins diff --git a/tests/test_transformer.py b/tests/test_transformer.py index 6e0ef3e..abfa25f 100644 --- a/tests/test_transformer.py +++ b/tests/test_transformer.py @@ -122,12 +122,12 @@ def test_pipeline(self): clf.predict(X) - def test_alpha_0(self): - cqm = SelectFromQuadraticModel.correlation_cqm(self.X, self.y, num_features=3, alpha=0) + def test_alpha_0(self): + cqm = SelectFromQuadraticModel().correlation_cqm(X=self.X, y=self.y, num_features=3, alpha=0) self.assertTrue(not any(cqm.objective.linear.values())) def test_alpha_1_no_quadratic(self): - cqm = SelectFromQuadraticModel.correlation_cqm(self.X, self.y, num_features=3, alpha=1) + cqm = SelectFromQuadraticModel().correlation_cqm(X=self.X, y=self.y, num_features=3, alpha=1) self.assertTrue(not any(cqm.objective.quadratic.values())) def test_alpha_1(self): @@ -183,7 +183,7 @@ def test_fixed_column(self): X[:, 1] = 0 X[:, 5] = 1 - cqm = SelectFromQuadraticModel.correlation_cqm(X, self.y, alpha=.5, num_features=5) + cqm = SelectFromQuadraticModel().correlation_cqm(X=X, y=self.y, alpha=.5, num_features=5) # in this case the linear bias for those two columns should be 0 self.assertEqual(cqm.objective.linear[1], 0) diff --git a/tests/test_utilities.py b/tests/test_utilities.py index df2a35e..aadaaef 100644 --- a/tests/test_utilities.py +++ b/tests/test_utilities.py @@ -51,7 +51,11 @@ import numpy as np -from dwave.plugins.sklearn.utilities import corrcoef, cov, dot_2d, _compute_cmi_c, _compute_cmi_d +from dwave.plugins.sklearn.utilities import ( + corrcoef, cov, dot_2d, + _compute_cmip_c, _compute_cmip_d, + _compute_cmip_cdc, _compute_cmip_ccd +) from scipy.special import digamma from sklearn.feature_selection._mutual_info import _compute_mi from sklearn.neighbors import KDTree @@ -183,28 +187,60 @@ def test_memmap(self): class TestMI(unittest.TestCase): + def test_cmi_cdc(self): + np.random.seed(42) + n_samples, n_neighbors = 4003, 4 + c1 = 
np.random.randn(n_samples,) + n_ss = 1001 + d1 = np.hstack(( + np.random.randint(-2, 1, (n_ss, )), + np.random.randint(10, 14, (n_samples-n_ss, )))) + d2 = np.random.randint(0, 4, (n_samples, )) + np.random.shuffle(d2) + cmi_ij_pair = _compute_cmip_c(c1, d1, d2, n_neighbors=n_neighbors) + cmi_ij_pair_cdc = _compute_cmip_cdc(c1, d1, d2, n_neighbors=n_neighbors) + cmi_ij_pair_ccd = _compute_cmip_ccd(c1, d1, d2, n_neighbors=n_neighbors) + self.assertAlmostEqual( + sum(cmi_ij_pair), sum(cmi_ij_pair_cdc), places=4, + msg="Computation for mixed continuous/discrete conditional mutual information is not accurate") + self.assertAlmostEqual( + sum(cmi_ij_pair), sum(cmi_ij_pair_ccd), places=4, + msg="Computation for mixed continuous/discrete conditional mutual information is not accurate") + + def test_cmi_symmetry(self): + np.random.seed(42) + n_samples, n_neighbors = 4003, 4 + # Test continuous implementation + Xy = np.random.randn(n_samples, 3) + cmi_ij_pair = _compute_cmip_c(Xy[:, 0], Xy[:, 1], Xy[:, 2], n_neighbors=n_neighbors) + cmi_ji_pair = _compute_cmip_c(Xy[:, 1], Xy[:, 0], Xy[:, 2], n_neighbors=n_neighbors) + self.assertAlmostEqual( + sum(cmi_ij_pair), sum(cmi_ji_pair), places=3, + msg="Computation for continuous conditional mutual information is not symmetric") + + c1 = np.random.randn(n_samples,) + c2 = np.random.randn(n_samples,) + n_ss = 1001 + d = np.hstack(( + np.random.randint(-2, 1, (n_ss, )), + np.random.randint(10, 14, (n_samples-n_ss, )))) + np.random.shuffle(d) + cmi_ij_pair = _compute_cmip_ccd(c1, c2, d, n_neighbors=n_neighbors) + cmi_ji_pair = _compute_cmip_ccd(c2, c1, d, n_neighbors=n_neighbors) + self.assertAlmostEqual( + sum(cmi_ij_pair), sum(cmi_ji_pair), places=3, + msg="Computation for mixed continuous/discrete conditional mutual information is not symmetric") + def test_cmi(self): """ We test the algorithm using the formula `I(x;y|z) = I(x;y) + I(z;y) - I(z;y|x)`. Since the mutual information data is an sklearn function, :func:`sklearn.feature_selection._mutual_info._compute_mi` it is highly likely that formula is valid only if our algorithm is correct. 
- For continous variables the formula may still break for some seeds """ np.random.seed(42) - n_samples, n_neighbors = 4003, 4 - # Test continuous implementation - Xy = np.random.randn(n_samples, 3) - cmi_ij = _compute_cmi_c(Xy[:, 0], Xy[:, 1], Xy[:, 2], n_neighbors=n_neighbors) - cmi_ji = _compute_cmi_c(Xy[:, 1], Xy[:, 0], Xy[:, 2], n_neighbors=n_neighbors) - mi_i = _compute_mi(Xy[:, 0], Xy[:, 2], False, False, n_neighbors=n_neighbors) - mi_j = _compute_mi(Xy[:, 1], Xy[:, 2], False, False, n_neighbors=n_neighbors) - self.assertAlmostEqual( - max(cmi_ij + mi_j - mi_i, 0), cmi_ji, places=3, - msg="The formula for continuous conditional mutual information is violated") - # Test discrete implementation - n_samples, n_ss = 103, 51 + n_samples, n_ss, n_neighbors = 103, 51, 4 xi = np.random.randint(0, 11, (n_samples, )) xj = np.hstack(( np.random.randint(-2, 1, (n_ss, )), @@ -212,13 +248,16 @@ def test_cmi(self): np.random.shuffle(xi) np.random.shuffle(xj) y = np.random.randint(-12, -10, (n_samples, )) - cmi_ij = _compute_cmi_d(xi, xj, y) - cmi_ji = _compute_cmi_d(xj, xi, y) + cmi_ij, cmi_ji = _compute_cmip_d(xi, xj, y) mi_i = _compute_mi(xi, y, True, True, n_neighbors=n_neighbors) mi_j = _compute_mi(xj, y, True, True, n_neighbors=n_neighbors) self.assertAlmostEqual( cmi_ij + mi_j - mi_i, cmi_ji, places=5, msg="The formula for discrete conditional mutual information is violated") + cmi_ij2 = _compute_cmip_d(xj, xi, y) + self.assertAlmostEqual( + sum(cmi_ij2), cmi_ji+cmi_ij, places=5, + msg="Discrete conditional mutual information computation is not symmetric") def test_implementation(self): # methods from https://github.com/jannisteunissen/mutual_information @@ -231,7 +270,14 @@ def _compute_cmi_t(x, z, y, n_neighbors): xyz = np.hstack((x, y, z)) k = np.full(n_samples, n_neighbors) radius = get_radius_kneighbors(xyz, n_neighbors) - + + mask = (radius == 0) + if mask.sum() > 0: + vals, ix, counts = np.unique(xyz[mask], axis=0, + return_inverse=True, + return_counts=True) + k[mask] = counts[ix] - 1 + nxz = num_points_within_radius(np.hstack((x, z)), radius) nyz = num_points_within_radius(np.hstack((y, z)), radius) nz = num_points_within_radius(z, radius) @@ -267,14 +313,20 @@ def num_points_within_radius(x, radius): kd = KDTree(x, metric="chebyshev") nx = kd.query_radius(x, radius, count_only=True, return_distance=False) return np.array(nx) - 1.0 - + + np.random.seed(42) n_samples, n_neighbors = 103, 4 # Test continuous implementation Xy = np.random.randn(n_samples, 3) - cmi_t = _compute_cmi_t(Xy[:,0], Xy[:,1], Xy[:,2], n_neighbors=n_neighbors) + cmi_t_01 = _compute_cmi_t(Xy[:,0], Xy[:,1], Xy[:,2], n_neighbors=n_neighbors) + cmi_t_10 = _compute_cmi_t(Xy[:,1], Xy[:,0], Xy[:,2], n_neighbors=n_neighbors) - cmi_c = _compute_cmi_c(Xy[:,0], Xy[:,1], Xy[:,2], n_neighbors=n_neighbors) + cmi_ij, cmi_ji = _compute_cmip_c(Xy[:,0], Xy[:,1], Xy[:,2], n_neighbors=n_neighbors) + + self.assertAlmostEqual( + max(cmi_t_01, 0), cmi_ij, places=5, + msg="The algorithm doesn't match the original implementation") self.assertAlmostEqual( - cmi_t, cmi_c, places=5, + max(cmi_t_10, 0), cmi_ji, places=5, msg="The algorithm doesn't match the original implementation") From 93b8b88717320131c1de12daf0f5c37692998d63 Mon Sep 17 00:00:00 2001 From: Aivar Sootla Date: Wed, 10 Apr 2024 17:12:59 +0100 Subject: [PATCH 05/12] addressing the first pass of the PR comments --- dwave/plugins/sklearn/transformers.py | 139 ++++++++++++++------------ dwave/plugins/sklearn/utilities.py | 131 +++++++++++++++--------- requirements.txt | 5 +- 
setup.cfg | 6 +- tests/test_utilities.py | 88 ++++++++++++++-- 5 files changed, 235 insertions(+), 134 deletions(-) diff --git a/dwave/plugins/sklearn/transformers.py b/dwave/plugins/sklearn/transformers.py index 88dc149..1333e00 100644 --- a/dwave/plugins/sklearn/transformers.py +++ b/dwave/plugins/sklearn/transformers.py @@ -14,25 +14,29 @@ from __future__ import annotations +from inspect import signature from itertools import combinations import tempfile import typing import dimod -from dwave.plugins.sklearn.utilities import corrcoef, _compute_off_diagonal_mi, tqdm_joblib from dwave.cloud.exceptions import ConfigFileError, SolverAuthenticationError +from dwave.plugins.sklearn.utilities import ( + corrcoef, + _compute_off_diagonal_cmi, + _compute_off_diagonal_mi, + _iterate_columns) from dwave.system import LeapHybridCQMSampler -from joblib import Parallel, delayed, cpu_count import numpy as np import numpy.typing as npt from scipy.sparse import issparse from sklearn.base import BaseEstimator from sklearn.feature_selection import SelectorMixin -from sklearn.feature_selection._mutual_info import _compute_mi, _iterate_columns +from sklearn.feature_selection import mutual_info_classif, mutual_info_regression from sklearn.preprocessing import scale from sklearn.utils import check_random_state +from sklearn.utils.parallel import Parallel, delayed from sklearn.utils.validation import check_is_fitted, check_array, check_X_y -from tqdm import tqdm __all__ = ["SelectFromQuadraticModel"] @@ -211,7 +215,7 @@ def mutual_information_cqm( discrete_target: bool = False, copy: bool = True, n_neighbors: int = 4, - n_workers: int = 1, + n_jobs: typing.Union[None, int] = None, random_state: typing.Union[None, int] = None, ) -> dimod.ConstrainedQuadraticModel: """Build a constrained quadratic model for feature selection. @@ -255,8 +259,12 @@ def mutual_information_cqm( See :func:`sklearn.feature_selection._mutual_info._estimate_mi` random_state: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` - n_workers: int, default=1 - Number of workers for parallel computation on the cpu + n_jobs: int, default=None + The number of parallel jobs to run for the conditional mutual information + computation. The parallelization is over the columns of `X`. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. Returns: A constrained quadratic model. 
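A minimal sketch of building this CQM directly, mirroring how the tests call `correlation_cqm`; the toy data, the chosen keyword values, and the assertion are illustrative assumptions rather than part of the patch:

import numpy as np
from dwave.plugins.sklearn.transformers import SelectFromQuadraticModel

rng = np.random.default_rng(0)
X = rng.standard_normal((200, 6))
y = X[:, 0] + 0.5 * X[:, 1] + 0.1 * rng.standard_normal(200)

# Build the model without invoking a hybrid solver; num_features and alpha
# follow the call style used in tests/test_transformer.py.
cqm = SelectFromQuadraticModel().mutual_information_cqm(
    X, y, num_features=3, alpha=0.5, conditional=True, n_jobs=1)
assert len(cqm.variables) == X.shape[1]  # one binary variable per feature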
@@ -284,20 +292,24 @@ def mutual_information_cqm( ) mi = estimate_mi_matrix( - X, y, discrete_features, discrete_target, + X, y, + discrete_features=discrete_features, + discrete_target=discrete_target, n_neighbors=n_neighbors, copy=copy, - random_state=random_state, n_workers=n_workers, + random_state=random_state, n_jobs=n_jobs, conditional=conditional) - if not conditional: - # mutliplying all features with num_features - np.multiply(mi, num_features, out=mi) - # mutpliypling off-diagonal ones with -(1-alpha) - np.multiply(mi, -(1 - alpha), out=mi) + if conditional: + np.multiply(mi, -1, out=mi) + else: + # mutpliypling off-diagonal ones with -1 + diagonal = -np.diag(mi).copy() # mutpliypling off-diagonal ones with alpha - diagonal = alpha * np.diag(mi) + np.multiply(mi, alpha, out=mi) np.fill_diagonal(mi, diagonal) + self.mi_matrix = mi + it = np.nditer(mi, flags=['multi_index'], op_flags=[['readonly']]) cqm.set_objective((*it.multi_index, x) for x in it if x) return cqm @@ -362,7 +374,8 @@ def fit( if self.method == "correlation": cqm = self.correlation_cqm(X, y, num_features=num_features, alpha=alpha) elif self.method == "mutual_information": - cqm = self.mutual_information_cqm(X, y, num_features=num_features, alpha=alpha, **kwargs) + cqm = self.mutual_information_cqm( + X, y, num_features=num_features, alpha=alpha, **kwargs) else: raise ValueError(f"only methods {self.acceptable_methods} are implemented") @@ -432,8 +445,7 @@ def estimate_mi_matrix( conditional: bool = True, copy: bool = True, random_state: typing.Union[None, int] = None, - n_workers: int = 1, - n_subsample: int = -1 + n_jobs: typing.Union[None, int] = None, ) -> npt.ArrayLike: """ For the feature array `X` and the target array `y` computes @@ -473,8 +485,12 @@ def estimate_mi_matrix( random_state: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` - n_workers: int, default=1 - Number of workers for parallel computation on the cpu + n_jobs: int, default=None + The number of parallel jobs to run for the conditional mutual information + computation. The parallelization is over the columns of `X`. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. Returns: mi_matrix : ndarray, shape (n_features, n_features) @@ -490,7 +506,6 @@ def estimate_mi_matrix( mutual information estimation for mixed, discrete and continuous data." IEEE Transactions on Information Theory 67.1 (2020): 464-484. """ - X, y = check_X_y(X, y, accept_sparse="csc", y_numeric=not discrete_target) n_samples, n_features = X.shape @@ -504,64 +519,56 @@ def estimate_mi_matrix( discrete_mask.fill(discrete_features) else: discrete_features = check_array(discrete_features, ensure_2d=False) - if discrete_features.dtype != "bool": + if np.issubdtype(discrete_features.dtype, bool): discrete_mask = np.zeros(n_features, dtype=bool) discrete_mask[discrete_features] = True else: discrete_mask = discrete_features - continuous_mask = ~discrete_mask - if np.any(continuous_mask) and issparse(X): - raise ValueError("Sparse matrix `X` can't have continuous features.") - - rng = check_random_state(random_state) - if np.any(continuous_mask): - if copy: - X = X.copy() - - X[:, continuous_mask] = scale( - X[:, continuous_mask], with_mean=False, copy=False - ) - - # Add small noise to continuous features as advised in Kraskov et. al. 
- X = X.astype(np.float64, copy=False) - means = np.maximum(1, np.mean(np.abs(X[:, continuous_mask]), axis=0)) - X[:, continuous_mask] += ( - 1e-10 - * means - * rng.standard_normal(size=(n_samples, np.sum(continuous_mask))) - ) - - if not discrete_target: + # copying X if needed + if copy: + X = X.copy() + + # Computing the diagonal terms + mutual_info_args = dict(X=X, y=y, + discrete_features=discrete_mask, + n_neighbors=n_neighbors, + copy=False, random_state=random_state) + # In earlier versions of sklearn functions `mutual_info_classif` and + # `mutual_info_regression` do not allow for parallel computations and + # do not have `n_jobs` argument. + if "n_jobs" in signature(mutual_info_classif).parameters and \ + "n_jobs" in signature(mutual_info_regression).parameters: + mutual_info_args["n_jobs"] = n_jobs + if discrete_target: + diagonal_vals = mutual_info_classif(**mutual_info_args) + else: + diagonal_vals = mutual_info_regression(**mutual_info_args) + # the target y is not modified even if copy=False + # https://github.com/scikit-learn/scikit-learn/issues/28793 + rng = check_random_state(random_state) y = scale(y, with_mean=False) y += ( 1e-10 * np.maximum(1, np.mean(np.abs(y))) * rng.standard_normal(size=n_samples) - ) - - n_features = X.shape[1] - max_n_workers = cpu_count()-1 - if n_workers > max_n_workers: - n_workers = max_n_workers - Warning(f"Specified number of workers {n_workers} is larger than the number of cpus." - f"Will use only {max_n_workers}.") - - mi_matrix = np.zeros((n_features, n_features), dtype=np.float64) - with tqdm_joblib(tqdm(desc="Computing off-diagonal terms", total=len(discrete_mask) * (len(discrete_mask) - 1) // 2 )) as progress_bar: - off_diagonal_vals = Parallel(n_jobs=n_workers)( - delayed(_compute_off_diagonal_mi) - (xi, xj, y, discrete_feature_i, discrete_feature_j, discrete_target, n_neighbors, conditional, n_subsample) - for (xi, discrete_feature_i), (xj, discrete_feature_j) in combinations(zip(_iterate_columns(X), discrete_mask), 2) - ) - diagonal_vals = Parallel(n_jobs=n_workers)( - delayed(_compute_mi) - (xi, y, discrete_feature_i, discrete_target, n_neighbors) - for xi, discrete_feature_i in zip(_iterate_columns(X), discrete_mask) ) + + mi_matrix = np.zeros((n_features, n_features), dtype=np.float64) np.fill_diagonal(mi_matrix, diagonal_vals) + off_diagonal_iter = combinations(zip(_iterate_columns(X), discrete_mask), 2) + if conditional: + off_diagonal_vals = Parallel(n_jobs=n_jobs, verbose=1)( + delayed(_compute_off_diagonal_cmi) + (xi, xj, y, discrete_feature_i, discrete_feature_j, discrete_target, n_neighbors) + for (xi, discrete_feature_i), (xj, discrete_feature_j) in off_diagonal_iter) + else: + off_diagonal_vals = Parallel(n_jobs=n_jobs, verbose=1)( + delayed(_compute_off_diagonal_mi) + (xi, xj, discrete_feature_i, discrete_feature_j, n_neighbors) + for (xi, discrete_feature_i), (xj, discrete_feature_j) in off_diagonal_iter) + off_diagonal_val = iter(off_diagonal_vals) for i, j in combinations(range(n_features), 2): mi_matrix[i, j] = mi_matrix[j, i] = next(off_diagonal_val) return mi_matrix - diff --git a/dwave/plugins/sklearn/utilities.py b/dwave/plugins/sklearn/utilities.py index 5910b53..dee7565 100644 --- a/dwave/plugins/sklearn/utilities.py +++ b/dwave/plugins/sklearn/utilities.py @@ -49,14 +49,13 @@ import typing -import contextlib -import joblib import numpy as np import numpy.typing as npt +from scipy.sparse import issparse from scipy.special import digamma from scipy.stats import entropy -from 
sklearn.feature_selection._mutual_info import _compute_mi from sklearn.neighbors import NearestNeighbors, KDTree +from sklearn.feature_selection import mutual_info_classif, mutual_info_regression __all__ = ["corrcoef", "cov", "dot_2d"] @@ -232,7 +231,7 @@ def dot_2d(a: npt.ArrayLike, b: npt.ArrayLike, *, return out -def _compute_off_diagonal_mi( +def _compute_off_diagonal_cmi( xi: npt.ArrayLike, xj: npt.ArrayLike, y: npt.ArrayLike, @@ -240,16 +239,10 @@ def _compute_off_diagonal_mi( discrete_feature_j: bool, discrete_target: bool, n_neighbors: int = 4, - conditional: bool = True, - n_subsample: int = -1 ): """ - Computing a distance `d` between features `x_i` and `x_j`. - If `conditional = True`, then conditional mutual infomation is used - `I(x_i; y | x_j)`. - - If `conditonal = False` then mutual information is used - `I(x_i; x_j)`. + Computing a distance `d` based on the conditional mutual infomation + between features `x_i` and `x_j`: `d = (I(x_i; y | x_j)+I(x_j; y | x_i))/2`. References ---------- @@ -261,25 +254,45 @@ def _compute_off_diagonal_mi( mutual information estimation for mixed, discrete and continuous data." IEEE Transactions on Information Theory 67.1 (2020): 464-484. """ - if conditional: - if n_subsample > 0: - idx = np.arange(xi.shape[0]) - np.random.shuffle(idx) - idx = idx[:n_subsample] - xi, xj, y = xi[idx], xj[idx], y[idx] - if discrete_feature_i and discrete_feature_j and discrete_target: - val = _compute_cmip_d(xi, xj, y) - elif n_subsample >= 0 and discrete_target: - val = _compute_cmip_ccd(xi, xj, y, n_neighbors) - elif n_subsample >= 0 and discrete_feature_i and (not discrete_target): - val = _compute_cmip_cdc(xj, xi, y, n_neighbors) - elif n_subsample >= 0 and (not discrete_feature_i) and discrete_feature_j and (not discrete_target): - val = _compute_cmip_cdc(xi, xj, y, n_neighbors) - else: - val = _compute_cmip_c(xi, xj, y, n_neighbors) - return sum(val) / 2 + if discrete_feature_i and discrete_feature_j and discrete_target: + val = _compute_cmip_d(xi, xj, y) + elif discrete_target: + val = _compute_cmip_ccd(xi, xj, y, n_neighbors) + elif discrete_feature_i and (not discrete_target): + val = _compute_cmip_cdc(xj, xi, y, n_neighbors) + elif (not discrete_feature_i) and discrete_feature_j and (not discrete_target): + val = _compute_cmip_cdc(xi, xj, y, n_neighbors) else: - return _compute_mi(xi, xj, discrete_feature_i, discrete_feature_j, n_neighbors) + val = _compute_cmip_c(xi, xj, y, n_neighbors) + return sum(val) / 2 + + +def _compute_off_diagonal_mi( + xi: npt.ArrayLike, + xj: npt.ArrayLike, + discrete_feature_i: bool, + discrete_feature_j: bool, + n_neighbors: int = 4, + ): + """ + Compute mutual information between features `I(x_i; x_j)`. + + References + ---------- + .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual + information". Phys. Rev. E 69, 2004. + .. [2] B. C. Ross "Mutual Information between Discrete and Continuous + Data Sets". PLoS ONE 9(2), 2014. + """ + if discrete_feature_j: + return mutual_info_classif( + xi.reshape(-1, 1), xj, discrete_features=[discrete_feature_i], n_neighbors=n_neighbors) + elif discrete_feature_i: + return mutual_info_classif( + xj.reshape(-1, 1), xi, discrete_features=[discrete_feature_j], n_neighbors=n_neighbors) + else: + return mutual_info_regression( + xi.reshape(-1, 1), xj, discrete_features=[discrete_feature_i], n_neighbors=n_neighbors) def _compute_cmip_d( @@ -368,7 +381,8 @@ def _compute_cmip_ccd( and `(I(x_j; y | x_i)`. 
Random variables `x_i`, `x_j` are assumed to be continuous, while `y` is assumed to be discrete. - Adapted from https://github.com/jannisteunissen/mutual_information + Adapted from https://github.com/jannisteunissen/mutual_information and + :func:`sklearn.feature_selection._mutual_info._compute_mi_cd` """ xi = xi.reshape((-1, 1)) xj = xj.reshape((-1, 1)) @@ -423,7 +437,8 @@ def _compute_cmip_cdc( and `(I(x_j; y | x_i)`. Random variables `x_i`, `y` are assumed to be continuous, while `x_j` is assumed to be discrete. - Adapted from https://github.com/jannisteunissen/mutual_information + Adapted from https://github.com/jannisteunissen/mutual_information and + :func:`sklearn.feature_selection._mutual_info._compute_mi_cd` """ xi = xi.reshape((-1, 1)) y = y.reshape((-1, 1)) @@ -477,7 +492,10 @@ def _get_radius_k_neighbours_d( d: npt.ArrayLike, n_neighbors: int = 4 ) -> typing.Tuple[npt.ArrayLike, npt.ArrayLike, npt.ArrayLike]: - + """ + Determine smallest radius around x containing n_neighbors neighbors + Inspired by :func:`sklearn.feature_selection._mutual_info._compute_mi_cd` + """ c1 = c1.reshape((-1, 1)) c2 = c2.reshape((-1, 1)) n_samples = c1.shape[0] @@ -520,7 +538,7 @@ def _num_points_within_radius_cd( radius: npt.ArrayLike) -> npt.ArrayLike: """ For each point, determine the number of other points within a given radius - Inspired by _compute_mi_cd + Inspired by :func:`sklearn.feature_selection._mutual_info._compute_mi_cd` """ c = c.reshape((-1, 1)) n_samples = c.shape[0] @@ -534,20 +552,33 @@ def _num_points_within_radius_cd( return m_all -@contextlib.contextmanager -def tqdm_joblib(tqdm_object): - """Context manager to patch joblib to report into tqdm progress bar given as argument - from https://stackoverflow.com/a/58936697/5133167 - """ - class TqdmBatchCompletionCallback(joblib.parallel.BatchCompletionCallBack): - def __call__(self, *args, **kwargs): - tqdm_object.update(n=self.batch_size) - return super().__call__(*args, **kwargs) +def _iterate_columns(X, columns=None): + """Iterate over columns of a matrix. + Copied from :func:`sklearn.feature_selection._mutual_info._iterate_columns` - old_batch_callback = joblib.parallel.BatchCompletionCallBack - joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback - try: - yield tqdm_object - finally: - joblib.parallel.BatchCompletionCallBack = old_batch_callback - tqdm_object.close() + Parameters + ---------- + X : ndarray or csc_matrix, shape (n_samples, n_features) + Matrix over which to iterate. + + columns : iterable or None, default=None + Indices of columns to iterate over. If None, iterate over all columns. + + Yields + ------ + x : ndarray, shape (n_samples,) + Columns of `X` in dense format. 
+ """ + if columns is None: + columns = range(X.shape[1]) + + if issparse(X): + for i in columns: + x = np.zeros(X.shape[0]) + start_ptr, end_ptr = X.indptr[i], X.indptr[i + 1] + x[X.indices[start_ptr:end_ptr]] = X.data[start_ptr:end_ptr] + yield x + else: + for i in columns: + yield X[:, i] + \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 43cc4f6..ee3a4a4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,10 +5,7 @@ numpy==1.20.0; python_version<"3.10.0" # oldest supported by dwave-system numpy==1.22.0; python_version~="3.10.0" # needed for scipy on windows oldest-supported-numpy; python_version>="3.11.0" -scikit-learn==1.2.0; python_version >= '3.8' -scikit-learn==1.0.2; python_version < '3.8' -joblib>=0.11 +scikit-learn==1.2.1 scipy>=1.7.3 -tqdm>=4.65.2 reno==3.5.0 diff --git a/setup.cfg b/setup.cfg index e84055b..b03a6f0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -23,15 +23,13 @@ install_requires = dimod>=0.12.3 dwave-system>=1.18.0 numpy>=1.20.0 - scikit-learn>=1.0.2 + scikit-learn>=1.2.1 scipy>=1.7.3 - joblib>=0.11 - tqdm>=4.65.0 packages = dwave dwave.plugins dwave.plugins.sklearn -python_requires = >=3.7 +python_requires = >=3.8 [pycodestyle] max-line-length = 100 diff --git a/tests/test_utilities.py b/tests/test_utilities.py index aadaaef..0ab57c7 100644 --- a/tests/test_utilities.py +++ b/tests/test_utilities.py @@ -47,18 +47,29 @@ import os.path import tempfile +import typing import unittest import numpy as np +import numpy.typing as npt from dwave.plugins.sklearn.utilities import ( - corrcoef, cov, dot_2d, - _compute_cmip_c, _compute_cmip_d, + corrcoef, cov, dot_2d, + _compute_off_diagonal_cmi, + _compute_off_diagonal_mi, + _compute_cmip_c, _compute_cmip_d, _compute_cmip_cdc, _compute_cmip_ccd ) +from dwave.plugins.sklearn.transformers import estimate_mi_matrix +from scipy.linalg import norm +from scipy.sparse import issparse from scipy.special import digamma from sklearn.feature_selection._mutual_info import _compute_mi +from sklearn.feature_selection import mutual_info_classif, mutual_info_regression from sklearn.neighbors import KDTree +from sklearn.preprocessing import scale +from sklearn.utils import check_random_state +from sklearn.utils.validation import check_array, check_X_y class TestCorrCoef(unittest.TestCase): def test_agreement(self): @@ -235,7 +246,7 @@ def test_cmi(self): """ We test the algorithm using the formula `I(x;y|z) = I(x;y) + I(z;y) - I(z;y|x)`. Since the mutual information data is an sklearn function, - :func:`sklearn.feature_selection._mutual_info._compute_mi` + :func:`sklearn.feature_selection._mutual_info.mutual_info_classif` it is highly likely that formula is valid only if our algorithm is correct. 
""" np.random.seed(42) @@ -249,16 +260,73 @@ def test_cmi(self): np.random.shuffle(xj) y = np.random.randint(-12, -10, (n_samples, )) cmi_ij, cmi_ji = _compute_cmip_d(xi, xj, y) - mi_i = _compute_mi(xi, y, True, True, n_neighbors=n_neighbors) - mi_j = _compute_mi(xj, y, True, True, n_neighbors=n_neighbors) - self.assertAlmostEqual( - cmi_ij + mi_j - mi_i, cmi_ji, places=5, - msg="The formula for discrete conditional mutual information is violated") + mi_i = mutual_info_classif( + xi.reshape(-1, 1), y, discrete_features=True, n_neighbors=n_neighbors) + mi_j = mutual_info_classif( + xj.reshape(-1, 1), y, discrete_features=True, n_neighbors=n_neighbors) + self.assertTrue( + np.allclose(cmi_ij + mi_j - mi_i, cmi_ji, atol=1e-5), + msg="The chain rule for discrete conditional mutual information is violated") cmi_ij2 = _compute_cmip_d(xj, xi, y) - self.assertAlmostEqual( - sum(cmi_ij2), cmi_ji+cmi_ij, places=5, + self.assertTrue( + np.allclose(sum(cmi_ij2), cmi_ji+cmi_ij, atol=1e-5), msg="Discrete conditional mutual information computation is not symmetric") + def test_mi(self): + np.random.seed(42) + # Test discrete implementation + n_samples, n_ss, n_neighbors = 103, 51, 4 + xi = np.random.randint(0, 11, (n_samples, )) + xj = np.hstack(( + np.random.randint(-2, 1, (n_ss, )), + np.random.randint(10, 14, (n_samples-n_ss, )))) + np.random.shuffle(xi) + np.random.shuffle(xj) + mi_ij = _compute_off_diagonal_mi( + xi.reshape(-1, 1), xj, True, True, n_neighbors=n_neighbors) + mi_ji = _compute_off_diagonal_mi( + xi.reshape(-1, 1), xj, True, True, n_neighbors=n_neighbors) + mi_ij_cl = mutual_info_classif(xi.reshape(-1, 1), xj, discrete_features=True) + self.assertTrue( + np.allclose(mi_ij_cl, mi_ij), + msg="Discrete mutual information is not computed correctly") + self.assertTrue( + np.allclose(mi_ij_cl, mi_ji), + msg="Discrete mutual information is not computed correctly") + + mi_ij = _compute_off_diagonal_mi( + xi, xj, False, True, n_neighbors=n_neighbors) + mi_ji = _compute_off_diagonal_mi( + xi, xj, False, True, n_neighbors=n_neighbors) + mi_ij_cl = mutual_info_classif( + xi.reshape(-1, 1), xj, discrete_features=False) + self.assertTrue( + np.allclose(mi_ij_cl, mi_ij), + msg="Mixed mutual information is not computed correctly") + self.assertTrue( + np.allclose(mi_ij_cl, mi_ji), + msg="Mixed mutual information is not computed correctly") + + xj = np.random.randn(n_samples) + mi_ij = _compute_off_diagonal_mi( + xi, xj, True, False, n_neighbors=n_neighbors) + mi_ij_cl = mutual_info_regression( + xi.reshape(-1, 1), xj, + discrete_features=True, n_neighbors=n_neighbors) + self.assertTrue( + np.allclose(mi_ij_cl, mi_ij, atol=1e-5), + msg="Continuous mutual information is not computed correctly") + + xi = np.random.randn(n_samples) + mi_ij = _compute_off_diagonal_mi( + xi, xj, False, False, n_neighbors=n_neighbors) + mi_ij_cl = mutual_info_regression( + xi.reshape(-1, 1), xj, + discrete_features=False, n_neighbors=n_neighbors) + self.assertTrue( + np.allclose(mi_ij_cl, mi_ij, atol=1e-3), + msg="Continuous mutual information is not computed correctly") + def test_implementation(self): # methods from https://github.com/jannisteunissen/mutual_information def _compute_cmi_t(x, z, y, n_neighbors): From 0ab51dd05ce4ce5786058af69aef41e22b095243 Mon Sep 17 00:00:00 2001 From: Aivar Sootla Date: Tue, 16 Apr 2024 15:39:39 +0100 Subject: [PATCH 06/12] Incoporate comments from sklearn, fix circle.ci, fix tests, fix formating --- .circleci/config.yml | 17 +- dwave/plugins/sklearn/transformers.py | 239 ++++++----- 
dwave/plugins/sklearn/utilities.py | 371 ++++++++++-------- .../notes/mutual-info-ddd1a23e320f88ae.yaml | 5 + requirements.txt | 2 +- setup.cfg | 4 +- tests/test_transformer.py | 4 +- tests/test_utilities.py | 107 ++--- 8 files changed, 387 insertions(+), 362 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 5c59d81..84a4ef1 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -205,22 +205,13 @@ workflows: name: test-linux-<< matrix.python-version >> | << matrix.pip-constraints >> matrix: parameters: - python-version: &python-versions ["3.7.9", "3.8.6", "3.9.0", "3.10.0", "3.11.0"] + python-version: &python-versions ["3.8.6", "3.9.0", "3.10.0", "3.11.0"] pip-constraints: - - "scikit-learn==1.0.2" # lowest supported by package + - "scikit-learn==1.2.1" # lowest supported by package # - "scikit-learn~=1.0.0" # 1.0.2 is the highest in ~=1.0.0 - - "scikit-learn~=1.1.0" - - "scikit-learn~=1.2.0" # latest in current minor as of March 2023 + # - "scikit-learn~=1.1.0" + # - "scikit-learn~=1.2.0" # latest in current minor as of March 2023 - "scikit-learn~=1.0" # latest in current major - exclude: - # sklearn < 1.1.3 does not support Python 3.11 - - python-version: "3.11.0" - pip-constraints: "scikit-learn==1.0.2" - # sklearn > 1.0.2 does not support Python 3.7 - - python-version: "3.7.9" - pip-constraints: "scikit-learn~=1.1.0" - - python-version: "3.7.9" - pip-constraints: "scikit-learn~=1.2.0" - test-macos: matrix: diff --git a/dwave/plugins/sklearn/transformers.py b/dwave/plugins/sklearn/transformers.py index 1333e00..01275d0 100644 --- a/dwave/plugins/sklearn/transformers.py +++ b/dwave/plugins/sklearn/transformers.py @@ -14,7 +14,6 @@ from __future__ import annotations -from inspect import signature from itertools import combinations import tempfile import typing @@ -23,8 +22,8 @@ from dwave.cloud.exceptions import ConfigFileError, SolverAuthenticationError from dwave.plugins.sklearn.utilities import ( corrcoef, - _compute_off_diagonal_cmi, - _compute_off_diagonal_mi, + _compute_cmi_distance, + _compute_mi, _iterate_columns) from dwave.system import LeapHybridCQMSampler import numpy as np @@ -56,21 +55,36 @@ class SelectFromQuadraticModel(SelectorMixin, BaseEstimator): :class:`sklearn.feature_selection.SelectKBest`. num_features: The number of features to select. + method: + If equal to ``correlation`` uses a correlation as a criterion for + choosing the features according to [1]_. If equal to ``mutual_information`` + uses mutual information criteria [2]_ or [3]_ (advances feature). time_limit: The time limit for the run on the hybrid solver. + .. [1] [Milne et al.] Milne, Andrew, Maxwell Rounds, and Phil Goddard. 2017. "Optimal Feature + Selection in Credit Scoring and Classification Using a Quantum Annealer." + 1QBit; White Paper. + https://1qbit.com/whitepaper/optimal-feature-selection-in-credit-scoring-classification-using-quantum-annealer + .. [2] Peng, F. Long, and C. Ding. Feature selection based on mutual information criteria + of max-dependency, max-relevance, and min-redundancy. IEEE Transactions on pattern + analysis and machine intelligence, 27(8):1226–1238, 2005. + .. [3] X. V. Nguyen, J. Chan, S. Romano, and J. Bailey. Effective global approaches for + mutual information based feature selection. In Proceedings of the 20th ACM SIGKDD + international conference on Knowledge discovery and data mining, + pages 512–521. ACM, 2014. 
""" ACCEPTED_METHODS = [ "correlation", "mutual_information", - ] + ] def __init__( self, *, alpha: float = .5, - method: str = "correlation", # undocumented until there is another supported + method: str = "correlation", num_features: int = 10, time_limit: typing.Optional[float] = None, ): @@ -128,7 +142,7 @@ def correlation_cqm( """Build a constrained quadratic model for feature selection. This method is based on maximizing influence and independence as - measured by correlation [Milne et al.]_. + measured by correlation [1]_. Args: X: @@ -153,7 +167,7 @@ def correlation_cqm( Returns: A constrained quadratic model. - .. [Milne et al.] Milne, Andrew, Maxwell Rounds, and Phil Goddard. 2017. "Optimal Feature + .. [1] Milne, Andrew, Maxwell Rounds, and Phil Goddard. 2017. "Optimal Feature Selection in Credit Scoring and Classification Using a Quantum Annealer." 1QBit; White Paper. https://1qbit.com/whitepaper/optimal-feature-selection-in-credit-scoring-classification-using-quantum-annealer @@ -169,7 +183,7 @@ def correlation_cqm( '==' if strict else '<=', num_features, label=f"{num_features}-hot", - ) + ) with tempfile.TemporaryFile() as fX, tempfile.TemporaryFile() as fout: # we make a copy of X because we'll be modifying it in-place within @@ -184,7 +198,7 @@ def correlation_cqm( dtype=np.result_type(X, y), mode="w+", shape=(X_copy.shape[1], X_copy.shape[1]), - ) + ) # main calculation. It modifies X_copy in-place corrcoef(X_copy, out=correlations, rowvar=False, copy=False) # we don't care about the direction of correlation in terms of @@ -193,7 +207,7 @@ def correlation_cqm( # multiplying all but last columns and rows with (1 - alpha) np.multiply(correlations[:-1, :-1], (1 - alpha), out=correlations[:-1, :-1]) # our objective - # we multiply by num_features to have consistent performance + # we multiply by num_features to have consistent performance # with the increase of the number of features np.fill_diagonal(correlations, correlations[:, -1] * (- alpha * num_features)) # Note: we only add terms on and above the diagonal @@ -203,7 +217,7 @@ def correlation_cqm( return cqm def mutual_information_cqm( - self, + self, X: npt.ArrayLike, y: npt.ArrayLike, *, @@ -217,16 +231,16 @@ def mutual_information_cqm( n_neighbors: int = 4, n_jobs: typing.Union[None, int] = None, random_state: typing.Union[None, int] = None, - ) -> dimod.ConstrainedQuadraticModel: + ) -> dimod.ConstrainedQuadraticModel: """Build a constrained quadratic model for feature selection. - If ``conditional`` is True then the conditional mutual information + If ``conditional`` is True then the conditional mutual information criterion from [2] is used, and if ``conditional`` is False then mutual information based criterion from [1] is used. - - For computation of mutual information and conditional mutual information - - + + For computation of mutual information and conditional mutual information + + Args: X: Feature vectors formatted as a numerical 2D array-like. 
@@ -249,16 +263,16 @@ def mutual_information_cqm( conditional: bool, default=True Whether to compute the off-diagonal terms using the conditional mutual information or joint mutual information - discrete_features: - See :func:`sklearn.feature_selection._mutual_info._estimate_mi` - discrete_target: - See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + discrete_features: + See :func:`sklearn.feature_selection.mutual_info_regression` + discrete_target: bool, default=False + Whether the target variable `y` is discrete n_neighbors: - See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + See :func:`sklearn.feature_selection.mutual_info_regression` copy: - See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + See :func:`sklearn.feature_selection.mutual_info_regression` random_state: - See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + See :func:`sklearn.feature_selection.mutual_info_regression` n_jobs: int, default=None The number of parallel jobs to run for the conditional mutual information computation. The parallelization is over the columns of `X`. @@ -269,17 +283,18 @@ def mutual_information_cqm( Returns: A constrained quadratic model. - References: - .. [1] Peng, F. Long, and C. Ding. Feature selection based on mutual information criteria of max-dependency, - max-relevance, and min-redundancy. IEEE Transactions on pattern analysis and machine intelligence, - 27(8):1226–1238, 2005. - .. [2] X. V. Nguyen, J. Chan, S. Romano, and J. Bailey. Effective global approaches for mutual information - based feature selection. In Proceedings of the 20th ACM SIGKDD international conference on - Knowledge discovery and data mining, pages 512–521. ACM, 2014. + References: + .. [1] Peng, F. Long, and C. Ding. Feature selection based on mutual information criteria + of max-dependency, max-relevance, and min-redundancy. IEEE Transactions on pattern + analysis and machine intelligence, 27(8):1226–1238, 2005. + .. [2] X. V. Nguyen, J. Chan, S. Romano, and J. Bailey. Effective global approaches for + mutual information based feature selection. In Proceedings of the 20th ACM SIGKDD + international conference on Knowledge discovery and data mining, + pages 512–521. ACM, 2014. """ - + self._check_params(X, y, alpha, num_features) - + cqm = dimod.ConstrainedQuadraticModel() cqm.add_variables(dimod.BINARY, X.shape[1]) @@ -289,30 +304,30 @@ def mutual_information_cqm( '==' if strict else '<=', num_features, label=f"{num_features}-hot", - ) + ) mi = estimate_mi_matrix( - X, y, - discrete_features=discrete_features, + X, y, + discrete_features=discrete_features, discrete_target=discrete_target, n_neighbors=n_neighbors, copy=copy, random_state=random_state, n_jobs=n_jobs, conditional=conditional) - - if conditional: + + if conditional: + # method from [2]_. np.multiply(mi, -1, out=mi) else: + # method from [1]_. 
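            # (Sketch of the intent, assuming the mRMR reading of [1]_: the diagonal
            # keeps -I(x_i; y) as linear relevance biases, while the off-diagonal
            # entries are scaled by alpha as pairwise redundancy penalties.)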
# mutpliypling off-diagonal ones with -1 diagonal = -np.diag(mi).copy() # mutpliypling off-diagonal ones with alpha - np.multiply(mi, alpha, out=mi) + np.multiply(mi, alpha, out=mi) np.fill_diagonal(mi, diagonal) - - self.mi_matrix = mi - + it = np.nditer(mi, flags=['multi_index'], op_flags=[['readonly']]) cqm.set_objective((*it.multi_index, x) for x in it if x) - return cqm + return cqm def fit( self, @@ -416,7 +431,7 @@ def _check_params(X: npt.ArrayLike, num_features: int): X = np.atleast_2d(np.asarray(X)) y = np.asarray(y) - + if X.ndim != 2: raise ValueError("X must be a 2-dimensional array-like") @@ -424,7 +439,8 @@ def _check_params(X: npt.ArrayLike, raise ValueError("y must be a 1-dimensional array-like") if y.shape[0] != X.shape[0]: - raise ValueError(f"requires: X.shape[0] == y.shape[0] but {X.shape[0]} != {y.shape[0]}") + raise ValueError( + f"requires: X.shape[0] == y.shape[0] but {X.shape[0]} != {y.shape[0]}") if not 0 <= alpha <= 1: raise ValueError(f"alpha must be between 0 and 1, given {alpha}") @@ -436,54 +452,63 @@ def _check_params(X: npt.ArrayLike, raise ValueError("X must have at least two rows") -def estimate_mi_matrix( +def estimate_mi_matrix( X: npt.ArrayLike, y: npt.ArrayLike, - discrete_features: typing.Union[str, npt.ArrayLike]="auto", + discrete_features: typing.Union[str, npt.ArrayLike] = "auto", discrete_target: bool = False, n_neighbors: int = 4, conditional: bool = True, copy: bool = True, random_state: typing.Union[None, int] = None, n_jobs: typing.Union[None, int] = None, - ) -> npt.ArrayLike: +) -> npt.ArrayLike: """ For the feature array `X` and the target array `y` computes - the matrix of (conditional) mutual information interactions. - - cmi_{i, j} = I(x_i; y) - + the matrix of (conditional) mutual information interactions. + The matrix is defined as follows: + + `M_(i, i) = I(x_i; y)` + If `conditional = True`, then the off-diagonal terms are computed: - - cmi_{i, j} = (I(x_i; y| x_j) + I(x_j; y| x_i)) / 2 - + + `M_(i, j) = (I(x_i; y| x_j) + I(x_j; y| x_i)) / 2` + Otherwise - - cmi_{i, j} = I(x_i; x_j) - - Computation of I(x; y) uses the scikit-learn implementation, i.e., - :func:`sklearn.feature_selection._mutual_info._estimate_mi`. The computation - of I(x; y| z) is based on - https://github.com/jannisteunissen/mutual_information + + `M_(i, j) = I(x_i; x_j)` + + Computation of I(x; y) uses modified scikit-learn methods. + The computation of I(x; y| z) is based on + https://github.com/jannisteunissen/mutual_information and [3]_. + + The method can be computationally expensive for a large number of features (> 1000) and + a large number of samples (> 100000). In this case, it can be advisable to downsample the + dataset. + + The estimation methods are linear in the number of outcomes (labels) of discrete distributions. + It may be beneficial to treat the discrete distrubitions with a large number of labels as + continuous distributions. 
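    A minimal usage sketch (the toy data and chosen keyword values below are
    illustrative assumptions; parameter names follow the signature above):

    import numpy as np
    from dwave.plugins.sklearn.transformers import estimate_mi_matrix

    rng = np.random.default_rng(1)
    X = rng.standard_normal((500, 4))
    y = X[:, 0] + 0.1 * rng.standard_normal(500)

    M = estimate_mi_matrix(X, y, conditional=True, n_neighbors=4, random_state=1)
    # M is symmetric: M[i, i] estimates I(x_i; y) and
    # M[i, j] estimates (I(x_i; y | x_j) + I(x_j; y | x_i)) / 2
    assert M.shape == (4, 4) and np.allclose(M, M.T)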
Args: - X: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` - - y: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` - + X: See :func:`sklearn.feature_selection.mutual_info_regression` + + y: See :func:`sklearn.feature_selection.mutual_info_regression` + conditional: bool, default=True Whether to compute the off-diagonal terms using the conditional mutual information or joint mutual information - discrete_features: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` - - discrete_target: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` - - n_neighbors: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` - - copy: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` - - random_state: See :func:`sklearn.feature_selection._mutual_info._estimate_mi` + discrete_features: See :func:`sklearn.feature_selection.mutual_info_regression` + + discrete_target: bool, default=False + Whether the target variable `y` is discrete + + n_neighbors: See :func:`sklearn.feature_selection.mutual_info_regression` + + copy: See :func:`sklearn.feature_selection.mutual_info_regression` + + random_state: See :func:`sklearn.feature_selection.mutual_info_regression` n_jobs: int, default=None The number of parallel jobs to run for the conditional mutual information @@ -491,13 +516,13 @@ def estimate_mi_matrix( ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - + Returns: mi_matrix : ndarray, shape (n_features, n_features) Interaction matrix between the features using (conditional) mutual information. A negative value will be replaced by 0. - References: + References: .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual information". Phys. Rev. E 69, 2004. .. [2] B. C. Ross "Mutual Information between Discrete and Continuous @@ -508,7 +533,7 @@ def estimate_mi_matrix( """ X, y = check_X_y(X, y, accept_sparse="csc", y_numeric=not discrete_target) n_samples, n_features = X.shape - + if isinstance(discrete_features, (str, bool)): if isinstance(discrete_features, str): if discrete_features == "auto": @@ -525,28 +550,26 @@ def estimate_mi_matrix( else: discrete_mask = discrete_features - # copying X if needed - if copy: - X = X.copy() - - # Computing the diagonal terms - mutual_info_args = dict(X=X, y=y, - discrete_features=discrete_mask, - n_neighbors=n_neighbors, - copy=False, random_state=random_state) - # In earlier versions of sklearn functions `mutual_info_classif` and - # `mutual_info_regression` do not allow for parallel computations and - # do not have `n_jobs` argument. 
- if "n_jobs" in signature(mutual_info_classif).parameters and \ - "n_jobs" in signature(mutual_info_regression).parameters: - mutual_info_args["n_jobs"] = n_jobs - if discrete_target: - diagonal_vals = mutual_info_classif(**mutual_info_args) - else: - diagonal_vals = mutual_info_regression(**mutual_info_args) - # the target y is not modified even if copy=False - # https://github.com/scikit-learn/scikit-learn/issues/28793 - rng = check_random_state(random_state) + continuous_mask = ~discrete_mask + if np.any(continuous_mask) and issparse(X): + raise ValueError("Sparse matrix `X` can't have continuous features.") + + rng = check_random_state(random_state) + if np.any(continuous_mask): + X = X.astype(np.float64, copy=copy) + X[:, continuous_mask] = scale( + X[:, continuous_mask], with_mean=False, copy=False + ) + + # Add small noise to continuous features as advised in Kraskov et. al. + means = np.maximum(1, np.mean(np.abs(X[:, continuous_mask]), axis=0)) + X[:, continuous_mask] += ( + 1e-10 + * means + * rng.standard_normal(size=(n_samples, np.sum(continuous_mask))) + ) + + if not discrete_target: y = scale(y, with_mean=False) y += ( 1e-10 @@ -555,19 +578,25 @@ def estimate_mi_matrix( ) mi_matrix = np.zeros((n_features, n_features), dtype=np.float64) + # Computing the diagonal terms + diagonal_vals = Parallel(n_jobs=n_jobs)( + delayed(_compute_mi)(x, y, discrete_feature, discrete_target, n_neighbors) + for x, discrete_feature in zip(_iterate_columns(X), discrete_mask) + ) np.fill_diagonal(mi_matrix, diagonal_vals) + # Computing the off-diagonal terms off_diagonal_iter = combinations(zip(_iterate_columns(X), discrete_mask), 2) if conditional: - off_diagonal_vals = Parallel(n_jobs=n_jobs, verbose=1)( - delayed(_compute_off_diagonal_cmi) + off_diagonal_vals = Parallel(n_jobs=n_jobs)( + delayed(_compute_cmi_distance) (xi, xj, y, discrete_feature_i, discrete_feature_j, discrete_target, n_neighbors) for (xi, discrete_feature_i), (xj, discrete_feature_j) in off_diagonal_iter) else: - off_diagonal_vals = Parallel(n_jobs=n_jobs, verbose=1)( - delayed(_compute_off_diagonal_mi) + off_diagonal_vals = Parallel(n_jobs=n_jobs)( + delayed(_compute_mi) (xi, xj, discrete_feature_i, discrete_feature_j, n_neighbors) for (xi, discrete_feature_i), (xj, discrete_feature_j) in off_diagonal_iter) - + off_diagonal_val = iter(off_diagonal_vals) for i, j in combinations(range(n_features), 2): mi_matrix[i, j] = mi_matrix[j, i] = next(off_diagonal_val) diff --git a/dwave/plugins/sklearn/utilities.py b/dwave/plugins/sklearn/utilities.py index dee7565..bbdf956 100644 --- a/dwave/plugins/sklearn/utilities.py +++ b/dwave/plugins/sklearn/utilities.py @@ -54,8 +54,8 @@ from scipy.sparse import issparse from scipy.special import digamma from scipy.stats import entropy +from sklearn.metrics.cluster import mutual_info_score from sklearn.neighbors import NearestNeighbors, KDTree -from sklearn.feature_selection import mutual_info_classif, mutual_info_regression __all__ = ["corrcoef", "cov", "dot_2d"] @@ -231,7 +231,7 @@ def dot_2d(a: npt.ArrayLike, b: npt.ArrayLike, *, return out -def _compute_off_diagonal_cmi( +def _compute_cmi_distance( xi: npt.ArrayLike, xj: npt.ArrayLike, y: npt.ArrayLike, @@ -239,11 +239,11 @@ def _compute_off_diagonal_cmi( discrete_feature_j: bool, discrete_target: bool, n_neighbors: int = 4, - ): +): """ Computing a distance `d` based on the conditional mutual infomation between features `x_i` and `x_j`: `d = (I(x_i; y | x_j)+I(x_j; y | x_i))/2`. - + References ---------- .. [1] A. Kraskov, H. 
Stogbauer and P. Grassberger, "Estimating mutual @@ -255,99 +255,68 @@ def _compute_off_diagonal_cmi( data." IEEE Transactions on Information Theory 67.1 (2020): 464-484. """ if discrete_feature_i and discrete_feature_j and discrete_target: - val = _compute_cmip_d(xi, xj, y) - elif discrete_target: - val = _compute_cmip_ccd(xi, xj, y, n_neighbors) + ans = _compute_cmip_d(xi, xj, y) + elif discrete_target: + ans = _compute_cmip_ccd(xi, xj, y, n_neighbors) elif discrete_feature_i and (not discrete_target): - val = _compute_cmip_cdc(xj, xi, y, n_neighbors) + ans = _compute_cmip_cdc(xj, xi, y, n_neighbors) elif (not discrete_feature_i) and discrete_feature_j and (not discrete_target): - val = _compute_cmip_cdc(xi, xj, y, n_neighbors) + ans = _compute_cmip_cdc(xi, xj, y, n_neighbors) else: - val = _compute_cmip_c(xi, xj, y, n_neighbors) - return sum(val) / 2 - - -def _compute_off_diagonal_mi( - xi: npt.ArrayLike, - xj: npt.ArrayLike, - discrete_feature_i: bool, - discrete_feature_j: bool, - n_neighbors: int = 4, - ): - """ - Compute mutual information between features `I(x_i; x_j)`. - - References - ---------- - .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual - information". Phys. Rev. E 69, 2004. - .. [2] B. C. Ross "Mutual Information between Discrete and Continuous - Data Sets". PLoS ONE 9(2), 2014. - """ - if discrete_feature_j: - return mutual_info_classif( - xi.reshape(-1, 1), xj, discrete_features=[discrete_feature_i], n_neighbors=n_neighbors) - elif discrete_feature_i: - return mutual_info_classif( - xj.reshape(-1, 1), xi, discrete_features=[discrete_feature_j], n_neighbors=n_neighbors) - else: - return mutual_info_regression( - xi.reshape(-1, 1), xj, discrete_features=[discrete_feature_i], n_neighbors=n_neighbors) + ans = _compute_cmip_c(xi, xj, y, n_neighbors) + return np.mean(ans) def _compute_cmip_d( - xi: npt.ArrayLike, - xj: npt.ArrayLike, - y: npt.ArrayLike) -> typing.Tuple[float]: + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike) -> typing.Tuple[float]: """ - Computes conditional mutual infomation pair `I(x_i; y | x_j)` `I(x_j; y | x_i)`. + Computes conditional mutual infomation pair `I(x_i; y | x_j)` and `I(x_j; y | x_i)`. All random variables `x_i`, `x_j` and `y` are discrete. 
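    For reference, the histogram-based estimate below follows the standard identity
    `I(x_i; y | x_j) = H(x_i, x_j) + H(x_j, y) - H(x_j) - H(x_i, x_j, y)`, with the
    symmetric expression for `I(x_j; y | x_i)` obtained by exchanging the roles of
    `x_i` and `x_j`.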
- + Adpated from https://github.com/dwave-examples/mutual-information-feature-selection """ - # Computing joint probability distribution for the features and the target bin_boundaries = [np.hstack((np.unique(col), np.inf)) for col in (xi, xj, y)] prob, _ = np.histogramdd(np.vstack((xi, xj, y)).T, bins=bin_boundaries) - Hijy = entropy(prob.flatten()) - Hij = entropy(prob.sum(axis=2).flatten()) + + # Computing entropy + Hijy = entropy(prob.flatten()) # H(x_i, x_j, y) + Hij = entropy(prob.sum(axis=2).flatten()) # H(x_i, x_j) cmi_ij = ( - Hij # H(x_i, x_j) - +entropy(prob.sum(axis=0).flatten()) # H(x_j, y) - -entropy(prob.sum(axis=(0,2))) # H(x_j) - -Hijy # H(x_i, x_j, y) + Hij-Hijy + + entropy(prob.sum(axis=0).flatten()) # H(x_j, y) + - entropy(prob.sum(axis=(0, 2))) # H(x_j) ) cmi_ji = ( - Hij # H(x_i, x_j) - +entropy(prob.sum(axis=1).flatten()) # H(x_i, y) - -entropy(prob.sum(axis=(1,2))) # H(x_i) - -Hijy # H(x_i, x_j, y) + Hij-Hijy + + entropy(prob.sum(axis=1).flatten()) # H(x_i, y) + - entropy(prob.sum(axis=(1, 2))) # H(x_i) ) return cmi_ij, cmi_ji def _compute_cmip_c( - xi: npt.ArrayLike, - xj: npt.ArrayLike, - y: npt.ArrayLike, - n_neighbors: int = 4) -> typing.Tuple[float]: + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike, + n_neighbors: int = 4) -> typing.Tuple[float]: """ - Computes conditional mutual information pair `(I(x_i; y | x_j)` - and `(I(x_j; y | x_i)`. All random variables `x_i`, `x_j` and `y` + Computes conditional mutual information pair `I(x_i; y | x_j)` + and `I(x_j; y | x_i)`. All random variables `x_i`, `x_j` and `y` are assumed to be continuous. - + Adapted from https://github.com/jannisteunissen/mutual_information """ - xi = xi.reshape((-1, 1)) + xi = xi.reshape((-1, 1)) xj = xj.reshape((-1, 1)) y = y.reshape((-1, 1)) - + # Here we rely on NearestNeighbors to select the fastest algorithm. - nn = NearestNeighbors(metric="chebyshev", n_neighbors=n_neighbors) - nn.fit(np.hstack((xi, xj, y))) - radius = nn.kneighbors()[0] - radius = np.nextafter(radius[:, -1], 0) - + radius = _get_radius_k_neighbours(np.hstack((xi, xj, y)), + n_neighbors=n_neighbors) + # KDTree is explicitly fit to allow for the querying of number of # neighbors within a specified radius n_j = _num_points_within_radius(xj, radius) @@ -357,41 +326,28 @@ def _compute_cmip_c( n_i = _num_points_within_radius(xi, radius) n_iy = _num_points_within_radius(np.hstack((y, xi)), radius) - cmi_ij = ( - digamma(n_neighbors) - +np.mean(digamma(n_j)) - -np.mean(digamma(n_ij)) - -np.mean(digamma(n_jy))) - - cmi_ji = ( - digamma(n_neighbors) - +np.mean(digamma(n_i)) - -np.mean(digamma(n_ij)) - -np.mean(digamma(n_iy))) - return max(0, cmi_ij), max(0, cmi_ji) + return _get_cmi_pair_from_estimates([n_neighbors], n_ij, n_j, n_jy, n_i, n_iy) def _compute_cmip_ccd( - xi: npt.ArrayLike, - xj: npt.ArrayLike, - y: npt.ArrayLike, - n_neighbors: int = 4) -> typing.Tuple[float]: + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike, + n_neighbors: int = 4) -> typing.Tuple[float]: """ - Computes conditional mutual information pair `(I(x_i; y | x_j)` - and `(I(x_j; y | x_i)`. Random variables `x_i`, `x_j` are assumed + Computes conditional mutual information pair `I(x_i; y | x_j)` + and `I(x_j; y | x_i)`. Random variables `x_i`, `x_j` are assumed to be continuous, while `y` is assumed to be discrete. 
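    For orientation, the k-nearest-neighbor estimators in this family follow the
    Frenzel-Pompe form (offset conventions as in the referenced implementations,
    stated here as a hedged summary rather than taken from the patch):
    `I(x_i; y | x_j) ~ mean(psi(k)) + mean(psi(n_{x_j})) - mean(psi(n_{x_i, x_j})) - mean(psi(n_{x_j, y}))`,
    where `psi` is the digamma function, `k` is the per-point neighbor count and the
    `n_*` terms count neighbors within the resulting Chebyshev radius; negative
    estimates are clipped to zero.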
- - Adapted from https://github.com/jannisteunissen/mutual_information and - :func:`sklearn.feature_selection._mutual_info._compute_mi_cd` + + Adapted from https://github.com/jannisteunissen/mutual_information and + :func:`sklearn.feature_selection.mutual_info_regression` """ - xi = xi.reshape((-1, 1)) + xi = xi.reshape((-1, 1)) xj = xj.reshape((-1, 1)) - (radius, - label_counts, - k_all) = _get_radius_k_neighbours_d( - xi, xj, y, - n_neighbors=n_neighbors) + # Here we rely on NearestNeighbors to select the fastest algorithm. + radius, label_counts, k_all = _get_radius_k_neighbours_d(np.hstack((xi, xj)), y, + n_neighbors=n_neighbors) # Ignore points with unique labels. mask = label_counts > 1 @@ -403,51 +359,36 @@ def _compute_cmip_ccd( radius = radius[mask] # KDTree is explicitly fit to allow for the querying of number of # neighbors within a specified radius - - # continuous + # continuous n_i = _num_points_within_radius(xi, radius) n_j = _num_points_within_radius(xj, radius) n_ij = _num_points_within_radius(np.hstack((xi, xj)), radius) - - # mixed estimates + # mixed discrete/continuous estimates n_iy = _num_points_within_radius_cd(xi, y, radius) n_jy = _num_points_within_radius_cd(xj, y, radius) - - common_terms = ( - np.mean(digamma(k_all)) - -np.mean(digamma(n_ij))) - cmi_ij = ( - common_terms - +np.mean(digamma(n_j)) - -np.mean(digamma(n_jy))) - cmi_ji = ( - common_terms - +np.mean(digamma(n_i)) - -np.mean(digamma(n_iy))) - return max(0, cmi_ij), max(0, cmi_ji) + + return _get_cmi_pair_from_estimates(k_all, n_ij, n_j, n_jy, n_i, n_iy) def _compute_cmip_cdc( - xi: npt.ArrayLike, - xj: npt.ArrayLike, - y: npt.ArrayLike, - n_neighbors: int = 4) -> typing.Tuple[float]: + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike, + n_neighbors: int = 4) -> typing.Tuple[float]: """ - Computes conditional mutual information pair `(I(x_i; y | x_j)` - and `(I(x_j; y | x_i)`. Random variables `x_i`, `y` are assumed + Computes conditional mutual information pair `I(x_i; y | x_j)` + and `I(x_j; y | x_i)`. Random variables `x_i`, `y` are assumed to be continuous, while `x_j` is assumed to be discrete. - + Adapted from https://github.com/jannisteunissen/mutual_information and - :func:`sklearn.feature_selection._mutual_info._compute_mi_cd` - """ + :func:`sklearn.feature_selection.mutual_info_regression` + """ xi = xi.reshape((-1, 1)) y = y.reshape((-1, 1)) - (radius, - label_counts, - k_all) = _get_radius_k_neighbours_d( - xi, y, xj, - n_neighbors=n_neighbors) + # Here we rely on NearestNeighbors to select the fastest algorithm. + radius, label_counts, k_all = _get_radius_k_neighbours_d(np.hstack((xi, y)), xj, + n_neighbors=n_neighbors) # Ignore points with unique labels. 
mask = label_counts > 1 @@ -457,71 +398,163 @@ def _compute_cmip_cdc( xj = xj[mask] y = y[mask] radius = radius[mask] - + # KDTree is explicitly fit to allow for the querying of number of # neighbors within a specified radius - - # continuous + # continuous n_i = _num_points_within_radius(xi, radius) n_iy = _num_points_within_radius(np.hstack((xi, y)), radius) - - # mixed estimates + # mixed coninuous/discrete estimates n_ij = _num_points_within_radius_cd(xi, xj, radius) n_jy = _num_points_within_radius_cd(y, xj, radius) - # discrete estimates n_j = label_counts - - common_terms = ( - np.mean(digamma(k_all)) - -np.mean(digamma(n_ij))) - cmi_ij = ( - common_terms - +np.mean(digamma(n_j)) - -np.mean(digamma(n_jy))) - cmi_ji = ( - common_terms - +np.mean(digamma(n_i)) - -np.mean(digamma(n_iy))) + + return _get_cmi_pair_from_estimates(k_all, n_ij, n_j, n_jy, n_i, n_iy) + + +def _get_cmi_pair_from_estimates( + k_all: npt.ArrayLike, n_ij: npt.ArrayLike, n_j: npt.ArrayLike, + n_jy: npt.ArrayLike, n_i: npt.ArrayLike, n_iy: npt.ArrayLike) -> typing.Tuple[float]: + + common_terms = np.mean(digamma(k_all))-np.mean(digamma(n_ij)) + cmi_ij = common_terms+np.mean(digamma(n_j))-np.mean(digamma(n_jy)) + cmi_ji = common_terms+np.mean(digamma(n_i))-np.mean(digamma(n_iy)) return max(0, cmi_ij), max(0, cmi_ji) +def _compute_mi( + x: npt.ArrayLike, + y: npt.ArrayLike, + discrete_feature_x: bool, + discrete_feature_y: bool, + n_neighbors: int = 4, +): + """ + Compute mutual information between features `I(x_i; x_j)`. + + References + ---------- + .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual + information". Phys. Rev. E 69, 2004. + .. [2] B. C. Ross "Mutual Information between Discrete and Continuous + Data Sets". PLoS ONE 9(2), 2014. + """ + if discrete_feature_y and discrete_feature_x: + return mutual_info_score(x, y) + elif discrete_feature_x: + return _compute_mi_cd(y, x, n_neighbors=n_neighbors) + elif discrete_feature_y: + return _compute_mi_cd(x, y, n_neighbors=n_neighbors) + else: + return _compute_mi_cc(x, y, n_neighbors=n_neighbors) + + +def _compute_mi_cc(x: npt.ArrayLike, y: npt.ArrayLike, n_neighbors: int) -> float: + """Computes mutual information `I(x; y)`. Random variables `x`, `y` are assumed + to be continuous. + + Adapted from :func:`sklearn.feature_selection.mutual_info_regression` + """ + n_samples = x.size + + x = x.reshape((-1, 1)) + y = y.reshape((-1, 1)) + + # Here we rely on NearestNeighbors to select the fastest algorithm. + radius = _get_radius_k_neighbours(np.hstack((x, y)), n_neighbors) + + # KDTree is explicitly fit to allow for the querying of number of + # neighbors within a specified radius + nx = _num_points_within_radius(x, radius) + ny = _num_points_within_radius(y, radius) + + mi = ( + digamma(n_samples) + + digamma(n_neighbors) + - np.mean(digamma(nx)) + - np.mean(digamma(ny)) + ) + + return max(0, mi) + + +def _compute_mi_cd(x: npt.ArrayLike, y: npt.ArrayLike, n_neighbors: int) -> float: + """Computes mutual information `I(x; y)`. Random variable `x`, is assumed + to be continuous, while `y` is assumed to be discrete. + + Adapted from :func:`sklearn.feature_selection.mutual_info_regression` + """ + radius, label_counts, k_all = _get_radius_k_neighbours_d(x, y, n_neighbors=n_neighbors) + + # Ignore points with unique labels. 
+ mask = label_counts > 1 + n_samples = np.sum(mask) + label_counts = label_counts[mask] + k_all = k_all[mask] + x = x[mask] + radius = radius[mask] + + m_all = _num_points_within_radius(x.reshape(-1, 1), radius) + + mi = ( + digamma(n_samples) + + np.mean(digamma(k_all)) + - np.mean(digamma(label_counts)) + - np.mean(digamma(m_all)) + ) + + return max(0, mi) + + +def _get_radius_k_neighbours(X: npt.ArrayLike, n_neighbors: int = 4) -> npt.ArrayLike: + """ + Determine smallest radius around `X` containing `n_neighbors` neighbors + Inspired by :func:`sklearn.feature_selection.mutual_info_regression` + """ + nn = NearestNeighbors(metric="chebyshev", n_neighbors=n_neighbors) + return _get_radius_from_nn(X, nn) + + def _get_radius_k_neighbours_d( - c1: npt.ArrayLike, - c2: npt.ArrayLike, + c: npt.ArrayLike, d: npt.ArrayLike, n_neighbors: int = 4 - ) -> typing.Tuple[npt.ArrayLike, npt.ArrayLike, npt.ArrayLike]: +) -> typing.Tuple[npt.ArrayLike, npt.ArrayLike, npt.ArrayLike]: """ - Determine smallest radius around x containing n_neighbors neighbors - Inspired by :func:`sklearn.feature_selection._mutual_info._compute_mi_cd` + Determine smallest radius around `c` and `d` containing `n_neighbors` neighbors + Inspired by :func:`sklearn.feature_selection.mutual_info_regression` """ - c1 = c1.reshape((-1, 1)) - c2 = c2.reshape((-1, 1)) - n_samples = c1.shape[0] + d = d.flatten() + n_samples = d.shape[0] + c = c.reshape((n_samples, -1)) + radius = np.empty(n_samples) label_counts = np.empty(n_samples) k_all = np.empty(n_samples) nn = NearestNeighbors(metric="chebyshev") - for label in np.unique(d): - mask = d == label + mask = d == label count = np.sum(mask) if count > 1: k = min(n_neighbors, count - 1) nn.set_params(n_neighbors=k) - nn.fit(np.hstack((c1[mask], c2[mask]))) - r = nn.kneighbors()[0] - radius[mask] = np.nextafter(r[:, -1], 0) + radius[mask] = _get_radius_from_nn(c[mask], nn) k_all[mask] = k - label_counts[mask] = np.sum(mask) + label_counts[mask] = np.sum(mask) return radius, label_counts, k_all -def _num_points_within_radius(x: npt.ArrayLike, radius: npt.ArrayLike): +def _get_radius_from_nn(X: npt.ArrayLike, nn: NearestNeighbors) -> npt.ArrayLike: + nn.fit(X) + r = nn.kneighbors()[0] + return np.nextafter(r[:, -1], 0) + + +def _num_points_within_radius(x: npt.ArrayLike, radius: npt.ArrayLike) -> npt.ArrayLike: """For each point, determine the number of other points within a given radius Function from https://github.com/jannisteunissen/mutual_information - + :param x: ndarray, shape (n_samples, n_dim) :param radius: radius, shape (n_samples,) :returns: number of points within radius @@ -533,28 +566,25 @@ def _num_points_within_radius(x: npt.ArrayLike, radius: npt.ArrayLike): def _num_points_within_radius_cd( - c: npt.ArrayLike, - d: npt.ArrayLike, - radius: npt.ArrayLike) -> npt.ArrayLike: + c: npt.ArrayLike, + d: npt.ArrayLike, + radius: npt.ArrayLike) -> npt.ArrayLike: """ For each point, determine the number of other points within a given radius - Inspired by :func:`sklearn.feature_selection._mutual_info._compute_mi_cd` + Inspired by :func:`sklearn.feature_selection.mutual_info_regression` """ c = c.reshape((-1, 1)) n_samples = c.shape[0] m_all = np.empty(n_samples) for label in np.unique(d): mask = d == label - kd = KDTree(c[mask], metric="chebyshev") - m_all[mask] = kd.query_radius( - c[mask], radius[mask], - count_only=True, return_distance=False) + m_all[mask] = _num_points_within_radius(c[mask], radius[mask]) return m_all def _iterate_columns(X, columns=None): """Iterate over 
columns of a matrix. - Copied from :func:`sklearn.feature_selection._mutual_info._iterate_columns` + Copied from :func:`sklearn.feature_selection._mutual_info` Parameters ---------- @@ -581,4 +611,3 @@ def _iterate_columns(X, columns=None): else: for i in columns: yield X[:, i] - \ No newline at end of file diff --git a/releasenotes/notes/mutual-info-ddd1a23e320f88ae.yaml b/releasenotes/notes/mutual-info-ddd1a23e320f88ae.yaml index c3ed08c..b809dc2 100644 --- a/releasenotes/notes/mutual-info-ddd1a23e320f88ae.yaml +++ b/releasenotes/notes/mutual-info-ddd1a23e320f88ae.yaml @@ -2,3 +2,8 @@ features: - | Add feature selection method using (conditional) mutual information +deprecations: + - | + Drop support for Python 3.7. + - | + Drop support for scikit-learn 1.2.0 diff --git a/requirements.txt b/requirements.txt index ee3a4a4..c6e0887 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,6 @@ numpy==1.22.0; python_version~="3.10.0" # needed for scipy on windows oldest-supported-numpy; python_version>="3.11.0" scikit-learn==1.2.1 -scipy>=1.7.3 +scipy==1.9.0 reno==3.5.0 diff --git a/setup.cfg b/setup.cfg index b03a6f0..b036d93 100644 --- a/setup.cfg +++ b/setup.cfg @@ -24,8 +24,8 @@ install_requires = dwave-system>=1.18.0 numpy>=1.20.0 scikit-learn>=1.2.1 - scipy>=1.7.3 -packages = + scipy>=1.9.0 +packages = dwave dwave.plugins dwave.plugins.sklearn diff --git a/tests/test_transformer.py b/tests/test_transformer.py index abfa25f..cfe4c49 100644 --- a/tests/test_transformer.py +++ b/tests/test_transformer.py @@ -122,14 +122,14 @@ def test_pipeline(self): clf.predict(X) - def test_alpha_0(self): + def test_alpha_0(self): cqm = SelectFromQuadraticModel().correlation_cqm(X=self.X, y=self.y, num_features=3, alpha=0) self.assertTrue(not any(cqm.objective.linear.values())) def test_alpha_1_no_quadratic(self): cqm = SelectFromQuadraticModel().correlation_cqm(X=self.X, y=self.y, num_features=3, alpha=1) self.assertTrue(not any(cqm.objective.quadratic.values())) - + def test_alpha_1(self): rng = np.random.default_rng(42) diff --git a/tests/test_utilities.py b/tests/test_utilities.py index 0ab57c7..364a64b 100644 --- a/tests/test_utilities.py +++ b/tests/test_utilities.py @@ -45,31 +45,22 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import os.path import tempfile -import typing import unittest import numpy as np -import numpy.typing as npt from dwave.plugins.sklearn.utilities import ( corrcoef, cov, dot_2d, - _compute_off_diagonal_cmi, - _compute_off_diagonal_mi, + _compute_mi, _compute_cmip_c, _compute_cmip_d, _compute_cmip_cdc, _compute_cmip_ccd ) -from dwave.plugins.sklearn.transformers import estimate_mi_matrix -from scipy.linalg import norm -from scipy.sparse import issparse from scipy.special import digamma -from sklearn.feature_selection._mutual_info import _compute_mi from sklearn.feature_selection import mutual_info_classif, mutual_info_regression from sklearn.neighbors import KDTree from sklearn.preprocessing import scale -from sklearn.utils import check_random_state -from sklearn.utils.validation import check_array, check_X_y + class TestCorrCoef(unittest.TestCase): def test_agreement(self): @@ -198,8 +189,8 @@ def test_memmap(self): class TestMI(unittest.TestCase): - def test_cmi_cdc(self): - np.random.seed(42) + def test_cmi_mixed(self): + np.random.seed(42) n_samples, n_neighbors = 4003, 4 c1 = np.random.randn(n_samples,) n_ss = 1001 @@ -219,7 +210,7 @@ def test_cmi_cdc(self): msg="Computation for mixed continuous/discrete conditional mutual information is not accurate") def test_cmi_symmetry(self): - np.random.seed(42) + np.random.seed(42) n_samples, n_neighbors = 4003, 4 # Test continuous implementation Xy = np.random.randn(n_samples, 3) @@ -240,9 +231,9 @@ def test_cmi_symmetry(self): cmi_ji_pair = _compute_cmip_ccd(c2, c1, d, n_neighbors=n_neighbors) self.assertAlmostEqual( sum(cmi_ij_pair), sum(cmi_ji_pair), places=3, - msg="Computation for mixed continuous/discrete conditional mutual information is not symmetric") + msg="Computation for mixed continuous/discrete conditional mutual information is not symmetric") - def test_cmi(self): + def test_cmi_discrete(self): """ We test the algorithm using the formula `I(x;y|z) = I(x;y) + I(z;y) - I(z;y|x)`. Since the mutual information data is an sklearn function, @@ -250,7 +241,7 @@ def test_cmi(self): it is highly likely that formula is valid only if our algorithm is correct. 
""" np.random.seed(42) - # Test discrete implementation + # Test discrete implementation n_samples, n_ss, n_neighbors = 103, 51, 4 xi = np.random.randint(0, 11, (n_samples, )) xj = np.hstack(( @@ -273,61 +264,41 @@ def test_cmi(self): msg="Discrete conditional mutual information computation is not symmetric") def test_mi(self): - np.random.seed(42) - # Test discrete implementation - n_samples, n_ss, n_neighbors = 103, 51, 4 + from sklearn.feature_selection._mutual_info import _compute_mi_cd, _compute_mi_cc + from sklearn.metrics.cluster import mutual_info_score + # Test discrete implementation + n_samples, n_ss, n_neighbors, random_state = 103, 51, 4, 15 + np.random.seed(random_state) xi = np.random.randint(0, 11, (n_samples, )) xj = np.hstack(( np.random.randint(-2, 1, (n_ss, )), np.random.randint(10, 14, (n_samples-n_ss, )))) np.random.shuffle(xi) np.random.shuffle(xj) - mi_ij = _compute_off_diagonal_mi( - xi.reshape(-1, 1), xj, True, True, n_neighbors=n_neighbors) - mi_ji = _compute_off_diagonal_mi( - xi.reshape(-1, 1), xj, True, True, n_neighbors=n_neighbors) - mi_ij_cl = mutual_info_classif(xi.reshape(-1, 1), xj, discrete_features=True) + xi_c = scale(np.random.randn(n_samples), with_mean=False) + xj_c = scale(np.random.randn(n_samples), with_mean=False) + + mi_ij = _compute_mi( + xi, xj, True, True, n_neighbors=n_neighbors) + mi_ij_cl = mutual_info_score(xi, xj) self.assertTrue( np.allclose(mi_ij_cl, mi_ij), msg="Discrete mutual information is not computed correctly") - self.assertTrue( - np.allclose(mi_ij_cl, mi_ji), - msg="Discrete mutual information is not computed correctly") - mi_ij = _compute_off_diagonal_mi( - xi, xj, False, True, n_neighbors=n_neighbors) - mi_ji = _compute_off_diagonal_mi( - xi, xj, False, True, n_neighbors=n_neighbors) - mi_ij_cl = mutual_info_classif( - xi.reshape(-1, 1), xj, discrete_features=False) - self.assertTrue( - np.allclose(mi_ij_cl, mi_ij), - msg="Mixed mutual information is not computed correctly") - self.assertTrue( - np.allclose(mi_ij_cl, mi_ji), - msg="Mixed mutual information is not computed correctly") - - xj = np.random.randn(n_samples) - mi_ij = _compute_off_diagonal_mi( - xi, xj, True, False, n_neighbors=n_neighbors) - mi_ij_cl = mutual_info_regression( - xi.reshape(-1, 1), xj, - discrete_features=True, n_neighbors=n_neighbors) + mi_ij = _compute_mi( + xi_c, xj, False, True, n_neighbors=n_neighbors) + mi_ij_cl = _compute_mi_cd(xi_c, xj, n_neighbors=n_neighbors) self.assertTrue( np.allclose(mi_ij_cl, mi_ij, atol=1e-5), - msg="Continuous mutual information is not computed correctly") - - xi = np.random.randn(n_samples) - mi_ij = _compute_off_diagonal_mi( - xi, xj, False, False, n_neighbors=n_neighbors) - mi_ij_cl = mutual_info_regression( - xi.reshape(-1, 1), xj, - discrete_features=False, n_neighbors=n_neighbors) + msg=f"Error in continuous features/discrete target MI estimation is larger than expected {abs(mi_ij_cl-mi_ij)}") + + mi_ij = _compute_mi(xi_c, xj_c, False, False, n_neighbors=n_neighbors) + mi_ij_cl = _compute_mi_cc(xi_c, xj_c, n_neighbors=n_neighbors) self.assertTrue( - np.allclose(mi_ij_cl, mi_ij, atol=1e-3), - msg="Continuous mutual information is not computed correctly") - - def test_implementation(self): + np.allclose(mi_ij_cl, mi_ij, atol=1e-5), + msg=f"Error in purely continuous MI is larger than expected {abs(mi_ij_cl-mi_ij)}") + + def test_cmi_continuous(self): # methods from https://github.com/jannisteunissen/mutual_information def _compute_cmi_t(x, z, y, n_neighbors): n_samples = len(x) @@ -338,20 +309,20 @@ def 
_compute_cmi_t(x, z, y, n_neighbors): xyz = np.hstack((x, y, z)) k = np.full(n_samples, n_neighbors) radius = get_radius_kneighbors(xyz, n_neighbors) - + mask = (radius == 0) if mask.sum() > 0: vals, ix, counts = np.unique(xyz[mask], axis=0, - return_inverse=True, - return_counts=True) + return_inverse=True, + return_counts=True) k[mask] = counts[ix] - 1 - + nxz = num_points_within_radius(np.hstack((x, z)), radius) nyz = num_points_within_radius(np.hstack((y, z)), radius) nz = num_points_within_radius(z, radius) cmi = max(0, np.mean(digamma(k)) - np.mean(digamma(nxz + 1)) - - np.mean(digamma(nyz + 1)) + np.mean(digamma(nz + 1))) + - np.mean(digamma(nyz + 1)) + np.mean(digamma(nz + 1))) return cmi def get_radius_kneighbors(x, n_neighbors): @@ -381,15 +352,15 @@ def num_points_within_radius(x, radius): kd = KDTree(x, metric="chebyshev") nx = kd.query_radius(x, radius, count_only=True, return_distance=False) return np.array(nx) - 1.0 - + np.random.seed(42) n_samples, n_neighbors = 103, 4 # Test continuous implementation Xy = np.random.randn(n_samples, 3) - cmi_t_01 = _compute_cmi_t(Xy[:,0], Xy[:,1], Xy[:,2], n_neighbors=n_neighbors) - cmi_t_10 = _compute_cmi_t(Xy[:,1], Xy[:,0], Xy[:,2], n_neighbors=n_neighbors) + cmi_t_01 = _compute_cmi_t(Xy[:, 0], Xy[:, 1], Xy[:, 2], n_neighbors=n_neighbors) + cmi_t_10 = _compute_cmi_t(Xy[:, 1], Xy[:, 0], Xy[:, 2], n_neighbors=n_neighbors) - cmi_ij, cmi_ji = _compute_cmip_c(Xy[:,0], Xy[:,1], Xy[:,2], n_neighbors=n_neighbors) + cmi_ij, cmi_ji = _compute_cmip_c(Xy[:, 0], Xy[:, 1], Xy[:, 2], n_neighbors=n_neighbors) self.assertAlmostEqual( max(cmi_t_01, 0), cmi_ij, places=5, From db27e38a43c43d89adba341236bc5d1ec26240e5 Mon Sep 17 00:00:00 2001 From: Aivar Sootla Date: Thu, 18 Apr 2024 11:50:14 +0100 Subject: [PATCH 07/12] fix scipy version --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c6e0887..4b995a4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,7 @@ numpy==1.22.0; python_version~="3.10.0" # needed for scipy on windows oldest-supported-numpy; python_version>="3.11.0" scikit-learn==1.2.1 -scipy==1.9.0 +scipy==1.9.0; python_version~="3.11.0" +scipy==1.9.2; python_version=="3.11.0" reno==3.5.0 From 443b070171c3334d64a568dc3702a46dd307c36d Mon Sep 17 00:00:00 2001 From: Aivar Sootla Date: Thu, 18 Apr 2024 11:58:40 +0100 Subject: [PATCH 08/12] fix requirements bug --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 4b995a4..cc26dd2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ numpy==1.22.0; python_version~="3.10.0" # needed for scipy on windows oldest-supported-numpy; python_version>="3.11.0" scikit-learn==1.2.1 -scipy==1.9.0; python_version~="3.11.0" +scipy==1.9.0; python_version<"3.11.0" scipy==1.9.2; python_version=="3.11.0" reno==3.5.0 From b2c3b077fc65e4402a2da065dd3a35ccc2cd74f3 Mon Sep 17 00:00:00 2001 From: Aivar Sootla Date: Fri, 19 Apr 2024 14:56:55 +0000 Subject: [PATCH 09/12] refactor, add new tests, add licence headers --- .circleci/config.yml | 3 - .../sklearn/_conditional_mutual_info.py | 764 ++++++++++++++++++ dwave/plugins/sklearn/transformers.py | 165 +--- dwave/plugins/sklearn/utilities.py | 387 --------- tests/test_mutual_info.py | 369 +++++++++ tests/test_utilities.py | 194 +---- 6 files changed, 1137 insertions(+), 745 deletions(-) create mode 100644 dwave/plugins/sklearn/_conditional_mutual_info.py create mode 100644 
tests/test_mutual_info.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 84a4ef1..845014e 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -208,9 +208,6 @@ workflows: python-version: &python-versions ["3.8.6", "3.9.0", "3.10.0", "3.11.0"] pip-constraints: - "scikit-learn==1.2.1" # lowest supported by package - # - "scikit-learn~=1.0.0" # 1.0.2 is the highest in ~=1.0.0 - # - "scikit-learn~=1.1.0" - # - "scikit-learn~=1.2.0" # latest in current minor as of March 2023 - "scikit-learn~=1.0" # latest in current major - test-macos: diff --git a/dwave/plugins/sklearn/_conditional_mutual_info.py b/dwave/plugins/sklearn/_conditional_mutual_info.py new file mode 100644 index 0000000..9e5a546 --- /dev/null +++ b/dwave/plugins/sklearn/_conditional_mutual_info.py @@ -0,0 +1,764 @@ +# The following traversal code is adapted from two sources +# I. Scikit-learn implementation for mutual information computation +# II. Computation of conditional mutual information in the repo +# https://github.com/jannisteunissen/mutual_information +# +# I. Methods and algorithms from `sklearn.feature_selection._mutual_info.py` +# +# Author: Nikolay Mayorov +# License: 3-clause BSD +# +# II. Modifications in https://github.com/jannisteunissen/mutual_information +# BSD 3-Clause License +# +# Copyright (c) 2022, Jannis Teunissen +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Modifications are distributed under the Apache 2.0 Software license. +# +# Copyright 2023 D-Wave Systems Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
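+
+# This module estimates a symmetric feature-interaction matrix from
+# (conditional) mutual information.  ``estimate_mi_matrix`` is the main entry
+# point; the remaining helpers implement nearest-neighbor (KSG-style)
+# estimators of mutual information for continuous, discrete, and mixed
+# variables, following the references cited in the docstrings below.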
+
+import typing
+from itertools import combinations
+
+import numpy as np
+import numpy.typing as npt
+
+from scipy.sparse import issparse
+from scipy.special import digamma
+from scipy.stats import entropy
+from sklearn.metrics.cluster import mutual_info_score
+from sklearn.neighbors import NearestNeighbors, KDTree
+from sklearn.preprocessing import scale
+from sklearn.utils import check_random_state
+from sklearn.utils.parallel import Parallel, delayed
+from sklearn.utils.validation import check_array, check_X_y
+
+
+def estimate_mi_matrix(
+    X: npt.ArrayLike,
+    y: npt.ArrayLike,
+    discrete_features: typing.Union[str, npt.ArrayLike] = "auto",
+    discrete_target: bool = False,
+    n_neighbors: int = 4,
+    conditional: bool = True,
+    copy: bool = True,
+    random_state: typing.Union[None, int] = None,
+    n_jobs: typing.Union[None, int] = None,
+) -> npt.ArrayLike:
+    """
+    For the feature array `X` and the target array `y` computes
+    the matrix of (conditional) mutual information interactions.
+    The matrix is defined as follows:
+
+    `M_(i, i) = I(x_i; y)`
+
+    If `conditional = True`, then the off-diagonal terms are computed:
+
+    `M_(i, j) = (I(x_i; y| x_j) + I(x_j; y| x_i)) / 2`
+
+    Otherwise
+
+    `M_(i, j) = I(x_i; x_j)`
+
+    Computation of I(x; y) uses modified scikit-learn methods.
+    The computation of I(x; y| z) is based on
+    https://github.com/jannisteunissen/mutual_information and [3]_.
+
+    The method can be computationally expensive for a large number of features (> 1000) and
+    a large number of samples (> 100000). In this case, it can be advisable to downsample the
+    dataset.
+
+    The estimation methods are linear in the number of outcomes (labels) of discrete distributions.
+    It may be beneficial to treat the discrete distributions with a large number of labels as
+    continuous distributions.
+
+    Args:
+        X: See :func:`sklearn.feature_selection.mutual_info_regression`
+
+        y: See :func:`sklearn.feature_selection.mutual_info_regression`
+
+        conditional: bool, default=True
+            Whether to compute the off-diagonal terms using the conditional mutual
+            information or joint mutual information
+
+        discrete_features: See :func:`sklearn.feature_selection.mutual_info_regression`
+
+        discrete_target: bool, default=False
+            Whether the target variable `y` is discrete
+
+        n_neighbors: See :func:`sklearn.feature_selection.mutual_info_regression`
+
+        copy: See :func:`sklearn.feature_selection.mutual_info_regression`
+
+        random_state: See :func:`sklearn.feature_selection.mutual_info_regression`
+
+        n_jobs: int, default=None
+            The number of parallel jobs to run for the conditional mutual information
+            computation. The parallelization is over the columns of `X`.
+            ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+            ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+            for more details.
+
+    Returns:
+        mi_matrix : ndarray, shape (n_features, n_features)
+            Interaction matrix between the features using (conditional) mutual information.
+            A negative value will be replaced by 0.
+
+    References:
+        .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual
+            information". Phys. Rev. E 69, 2004.
+        .. [2] B. C. Ross "Mutual Information between Discrete and Continuous
+            Data Sets". PLoS ONE 9(2), 2014.
+        .. [3] Mesner, Octavio César, and Cosma Rohilla Shalizi. "Conditional
+            mutual information estimation for mixed, discrete and continuous
+            data." IEEE Transactions on Information Theory 67.1 (2020): 464-484.
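+
+    Example:
+        A minimal illustrative sketch (the data below is synthetic and used
+        only for demonstration):
+
+        >>> import numpy as np
+        >>> rng = np.random.default_rng(0)
+        >>> X = rng.standard_normal((100, 3))
+        >>> y = X[:, 0] + 0.1 * rng.standard_normal(100)
+        >>> M = estimate_mi_matrix(X, y, discrete_features=False)
+        >>> M.shape
+        (3, 3)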
+ """ + X, y = check_X_y(X, y, accept_sparse="csc", y_numeric=not discrete_target) + n_samples, n_features = X.shape + + if isinstance(discrete_features, (str, bool)): + if isinstance(discrete_features, str): + if discrete_features == "auto": + discrete_features = issparse(X) + else: + raise ValueError("Invalid string value for discrete_features.") + discrete_mask = np.empty(n_features, dtype=bool) + discrete_mask.fill(discrete_features) + else: + discrete_features = check_array(discrete_features, ensure_2d=False) + if np.issubdtype(discrete_features.dtype, bool): + discrete_mask = np.zeros(n_features, dtype=bool) + discrete_mask[discrete_features] = True + else: + discrete_mask = discrete_features + + continuous_mask = ~discrete_mask + if np.any(continuous_mask) and issparse(X): + raise ValueError("Sparse matrix `X` can't have continuous features.") + + rng = check_random_state(random_state) + if np.any(continuous_mask): + X = X.astype(np.float64, copy=copy) + X[:, continuous_mask] = scale( + X[:, continuous_mask], with_mean=False, copy=False + ) + + # Add small noise to continuous features as advised in Kraskov et. al. + means = np.maximum(1, np.mean(np.abs(X[:, continuous_mask]), axis=0)) + X[:, continuous_mask] += ( + 1e-10 + * means + * rng.standard_normal(size=(n_samples, np.sum(continuous_mask))) + ) + + if not discrete_target: + y = scale(y, with_mean=False) + y += ( + 1e-10 + * np.maximum(1, np.mean(np.abs(y))) + * rng.standard_normal(size=n_samples) + ) + + mi_matrix = np.zeros((n_features, n_features), dtype=np.float64) + # Computing the diagonal terms + diagonal_vals = Parallel(n_jobs=n_jobs)( + delayed(_compute_mi)(x, y, discrete_feature, discrete_target, n_neighbors) + for x, discrete_feature in zip(_iterate_columns(X), discrete_mask) + ) + np.fill_diagonal(mi_matrix, diagonal_vals) + # Computing the off-diagonal terms + off_diagonal_iter = combinations(zip(_iterate_columns(X), discrete_mask), 2) + if conditional: + off_diagonal_vals = Parallel(n_jobs=n_jobs)( + delayed(_compute_cmi_distance) + (xi, xj, y, discrete_feature_i, discrete_feature_j, discrete_target, n_neighbors) + for (xi, discrete_feature_i), (xj, discrete_feature_j) in off_diagonal_iter) + else: + off_diagonal_vals = Parallel(n_jobs=n_jobs)( + delayed(_compute_mi) + (xi, xj, discrete_feature_i, discrete_feature_j, n_neighbors) + for (xi, discrete_feature_i), (xj, discrete_feature_j) in off_diagonal_iter) + + off_diagonal_val = iter(off_diagonal_vals) + for i, j in combinations(range(n_features), 2): + mi_matrix[i, j] = mi_matrix[j, i] = next(off_diagonal_val) + return mi_matrix + + +def _compute_cmi_distance( + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike, + discrete_feature_i: bool, + discrete_feature_j: bool, + discrete_target: bool, + n_neighbors: int = 4, +) -> float: + """ + Computes a distance `d` based on the conditional mutual information + between features `x_i` and `x_j`: `d = (I(x_i; y | x_j)+I(x_j; y | x_i))/2`. + Args: + xi: + A feature vector formatted as a numerical 1D array-like. + xj: + A feature vector formatted as a numerical 1D array-like. + y: + Target vector formatted as a numerical 1D array-like. + discrete_feature_i: + Whether to consider `xi` as a discrete variable. + discrete_feature_j: + Whether to consider `xj` as a discrete variable. + discrete_target: + Whether to consider `y` as a discrete variable. + n_neighbors: + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. 
Higher values reduce variance of the estimation, but + could introduce a bias. + Returns: + Distance between features based on conditional mutual information. + References + ---------- + .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual + information". Phys. Rev. E 69, 2004. + .. [2] B. C. Ross "Mutual Information between Discrete and Continuous + Data Sets". PLoS ONE 9(2), 2014. + .. [3] Mesner, Octavio César, and Cosma Rohilla Shalizi. "Conditional + mutual information estimation for mixed, discrete and continuous + data." IEEE Transactions on Information Theory 67.1 (2020): 464-484. + """ + if discrete_feature_i and discrete_feature_j and discrete_target: + ans = _compute_cmip_d(xi, xj, y) + elif discrete_target: + ans = _compute_cmip_ccd(xi, xj, y, n_neighbors) + elif discrete_feature_i and (not discrete_target): + ans = _compute_cmip_cdc(xj, xi, y, n_neighbors) + elif (not discrete_feature_i) and discrete_feature_j and (not discrete_target): + ans = _compute_cmip_cdc(xi, xj, y, n_neighbors) + else: + ans = _compute_cmip_c(xi, xj, y, n_neighbors) + return np.mean(ans) + + +def _compute_cmip_d( + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike) -> typing.Tuple[float]: + """ + Computes conditional mutual information pair `I(x_i; y | x_j)` and `I(x_j; y | x_i)`. + All random variables `x_i`, `x_j` and `y` are discrete. + + Adapted from https://github.com/dwave-examples/mutual-information-feature-selection + Args: + xi: + A feature vector formatted as a numerical 1D array-like. + xj: + A feature vector formatted as a numerical 1D array-like. + y: + Target vector formatted as a numerical 1D array-like. + n_neighbors: + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. + Returns: + Pair of conditional mutual information values between two features and the target vector. + """ + # Computing joint probability distribution for the features and the target + bin_boundaries = [np.hstack((np.unique(col), np.inf)) for col in (xi, xj, y)] + prob, _ = np.histogramdd(np.vstack((xi, xj, y)).T, bins=bin_boundaries) + + # Computing entropy + Hijy = entropy(prob.flatten()) # H(x_i, x_j, y) + Hij = entropy(prob.sum(axis=2).flatten()) # H(x_i, x_j) + cmi_ij = ( + Hij-Hijy + + entropy(prob.sum(axis=0).flatten()) # H(x_j, y) + - entropy(prob.sum(axis=(0, 2))) # H(x_j) + ) + cmi_ji = ( + Hij-Hijy + + entropy(prob.sum(axis=1).flatten()) # H(x_i, y) + - entropy(prob.sum(axis=(1, 2))) # H(x_i) + ) + return cmi_ij, cmi_ji + + +def _compute_cmip_c( + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike, + n_neighbors: int = 4) -> typing.Tuple[float]: + """ + Computes conditional mutual information pair `I(x_i; y | x_j)` + and `I(x_j; y | x_i)`. All random variables `x_i`, `x_j` and `y` + are assumed to be continuous. + + Adapted from https://github.com/jannisteunissen/mutual_information + :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + + Args: + xi: + A feature vector formatted as a numerical 1D array-like. + xj: + A feature vector formatted as a numerical 1D array-like. + y: + Target vector formatted as a numerical 1D array-like. + n_neighbors: + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. 
+ Returns: + Pair of conditional mutual information values between two features and the target vector. + """ + xi = xi.reshape((-1, 1)) + xj = xj.reshape((-1, 1)) + y = y.reshape((-1, 1)) + + # Here we rely on NearestNeighbors to select the fastest algorithm. + radius = _get_radius_k_neighbours(np.hstack((xi, xj, y)), + n_neighbors=n_neighbors) + + # KDTree is explicitly fit to allow for the querying of number of + # neighbors within a specified radius + n_j = _num_points_within_radius(xj, radius) + n_ij = _num_points_within_radius(np.hstack((xi, xj)), radius) + n_jy = _num_points_within_radius(np.hstack((y, xj)), radius) + + n_i = _num_points_within_radius(xi, radius) + n_iy = _num_points_within_radius(np.hstack((y, xi)), radius) + + return _get_cmi_pair_from_estimates([n_neighbors], n_ij, n_j, n_jy, n_i, n_iy) + + +def _compute_cmip_ccd( + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike, + n_neighbors: int = 4) -> typing.Tuple[float]: + """ + Computes conditional mutual information pair `I(x_i; y | x_j)` + and `I(x_j; y | x_i)`. Random variables `x_i`, `x_j` are assumed + to be continuous, while `y` is assumed to be discrete. + + Adapted from https://github.com/jannisteunissen/mutual_information + :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + Args: + xi: + A feature vector formatted as a numerical 1D array-like. + xj: + A feature vector formatted as a numerical 1D array-like. + y: + Target vector formatted as a numerical 1D array-like. + n_neighbors: + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. + Returns: + Pair of conditional mutual information values between two features and the target vector. + """ + xi = xi.reshape((-1, 1)) + xj = xj.reshape((-1, 1)) + + # Here we rely on NearestNeighbors to select the fastest algorithm. + radius, label_counts, k_all = _get_radius_k_neighbours_d(np.hstack((xi, xj)), y, + n_neighbors=n_neighbors) + + # Ignore points with unique labels. + mask = label_counts > 1 + # label_counts = label_counts[mask] + k_all = k_all[mask] + xi = xi[mask] + xj = xj[mask] + y = y[mask] + radius = radius[mask] + # KDTree is explicitly fit to allow for the querying of number of + # neighbors within a specified radius + # continuous + n_i = _num_points_within_radius(xi, radius) + n_j = _num_points_within_radius(xj, radius) + n_ij = _num_points_within_radius(np.hstack((xi, xj)), radius) + # mixed discrete/continuous estimates + n_iy = _num_points_within_radius_cd(xi, y, radius) + n_jy = _num_points_within_radius_cd(xj, y, radius) + + return _get_cmi_pair_from_estimates(k_all, n_ij, n_j, n_jy, n_i, n_iy) + + +def _compute_cmip_cdc( + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike, + n_neighbors: int = 4) -> typing.Tuple[float]: + """ + Computes conditional mutual information pair `I(x_i; y | x_j)` + and `I(x_j; y | x_i)`. Random variables `x_i`, `y` are assumed + to be continuous, while `x_j` is assumed to be discrete. + + Adapted from https://github.com/jannisteunissen/mutual_information + :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + Args: + xi: + A feature vector formatted as a numerical 1D array-like. + xj: + A feature vector formatted as a numerical 1D array-like. + y: + Target vector formatted as a numerical 1D array-like. 
+ n_neighbors: + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. + Returns: + Pair of conditional mutual information values between two features and the target vector. + """ + xi = xi.reshape((-1, 1)) + y = y.reshape((-1, 1)) + + # Here we rely on NearestNeighbors to select the fastest algorithm. + radius, label_counts, k_all = _get_radius_k_neighbours_d(np.hstack((xi, y)), xj, + n_neighbors=n_neighbors) + + # Ignore points with unique labels. + mask = label_counts > 1 + label_counts = label_counts[mask] + k_all = k_all[mask] + xi = xi[mask] + xj = xj[mask] + y = y[mask] + radius = radius[mask] + + # KDTree is explicitly fit to allow for the querying of number of + # neighbors within a specified radius + # continuous + n_i = _num_points_within_radius(xi, radius) + n_iy = _num_points_within_radius(np.hstack((xi, y)), radius) + # mixed coninuous/discrete estimates + n_ij = _num_points_within_radius_cd(xi, xj, radius) + n_jy = _num_points_within_radius_cd(y, xj, radius) + # discrete estimates + n_j = label_counts + + return _get_cmi_pair_from_estimates(k_all, n_ij, n_j, n_jy, n_i, n_iy) + + +def _get_cmi_pair_from_estimates( + k_all: npt.ArrayLike, n_ij: npt.ArrayLike, n_j: npt.ArrayLike, + n_jy: npt.ArrayLike, n_i: npt.ArrayLike, n_iy: npt.ArrayLike) -> typing.Tuple[float]: + """Get an estimate from nearest neighbors counts""" + common_terms = np.mean(digamma(k_all))-np.mean(digamma(n_ij)) + cmi_ij = common_terms+np.mean(digamma(n_j))-np.mean(digamma(n_jy)) + cmi_ji = common_terms+np.mean(digamma(n_i))-np.mean(digamma(n_iy)) + return max(0, cmi_ij), max(0, cmi_ji) + + +def _compute_mi( + x: npt.ArrayLike, + y: npt.ArrayLike, + discrete_feature_x: bool, + discrete_feature_y: bool, + n_neighbors: int = 4, +): + """ + Compute mutual information between features `I(x_i; x_j)`. + + Adapted from :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + + Args: + x, y: + Feature vectors each formatted as a numerical 1D array-like. + discrete_feature_x, discrete_feature_y: + Whether to consider `x` and `y` as discrete variables. + n_neighbors: + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. + Returns: + Mutual information between feature vectors x and y. + References + ---------- + .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual + information". Phys. Rev. E 69, 2004. + .. [2] B. C. Ross "Mutual Information between Discrete and Continuous + Data Sets". PLoS ONE 9(2), 2014. + """ + if discrete_feature_y and discrete_feature_x: + return mutual_info_score(x, y) + elif discrete_feature_x: + return _compute_mi_cd(y, x, n_neighbors=n_neighbors) + elif discrete_feature_y: + return _compute_mi_cd(x, y, n_neighbors=n_neighbors) + else: + return _compute_mi_cc(x, y, n_neighbors=n_neighbors) + + +def _compute_mi_cc(x: npt.ArrayLike, y: npt.ArrayLike, n_neighbors: int) -> float: + """Computes mutual information `I(x; y)`. Random variables `x`, `y` are assumed + to be continuous. + + Adapted from :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + Args: + x, y: + Feature vectors each formatted as a numerical 1D array-like. 
+ n_neighbors: + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. + Returns: + Mutual information between feature vectors x and y. + """ + n_samples = x.size + + x = x.reshape((-1, 1)) + y = y.reshape((-1, 1)) + + # Here we rely on NearestNeighbors to select the fastest algorithm. + radius = _get_radius_k_neighbours(np.hstack((x, y)), n_neighbors) + + # KDTree is explicitly fit to allow for the querying of number of + # neighbors within a specified radius + nx = _num_points_within_radius(x, radius) + ny = _num_points_within_radius(y, radius) + + mi = ( + digamma(n_samples) + + digamma(n_neighbors) + - np.mean(digamma(nx)) + - np.mean(digamma(ny)) + ) + + return max(0, mi) + + +def _compute_mi_cd(x: npt.ArrayLike, y: npt.ArrayLike, n_neighbors: int) -> float: + """Computes mutual information `I(x; y)`. Random variable `x`, is assumed + to be continuous, while `y` is assumed to be discrete. + + Adapted from :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + Args: + x, y: + Feature vectors each formatted as a numerical 1D array-like. + n_neighbors: + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. + Returns: + Mutual information between feature vectors x and y. + """ + radius, label_counts, k_all = _get_radius_k_neighbours_d(x, y, n_neighbors=n_neighbors) + + # Ignore points with unique labels. + mask = label_counts > 1 + n_samples = np.sum(mask) + label_counts = label_counts[mask] + k_all = k_all[mask] + x = x[mask] + radius = radius[mask] + + m_all = _num_points_within_radius(x.reshape(-1, 1), radius) + + mi = ( + digamma(n_samples) + + np.mean(digamma(k_all)) + - np.mean(digamma(label_counts)) + - np.mean(digamma(m_all)) + ) + + return max(0, mi) + + +def _get_radius_k_neighbours(X: npt.ArrayLike, n_neighbors: int = 4) -> npt.ArrayLike: + """ + Determine the smallest radius around `X` containing `n_neighbors` neighbors + Inspired by :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + + Args: + X: + Array of features. + n_neighbors: + Number of nearest neighbors. + Returns: + Vector of radii defined by the k nearest neighbors. + """ + nn = NearestNeighbors(metric="chebyshev", n_neighbors=n_neighbors) + return _get_radius_from_nn(X, nn) + + +def _get_radius_k_neighbours_d( + c: npt.ArrayLike, + d: npt.ArrayLike, + n_neighbors: int = 4 +) -> typing.Tuple[npt.ArrayLike, npt.ArrayLike, npt.ArrayLike]: + """ + Determine smallest radius around `c` and `d` containing `n_neighbors`. + + Inspired by :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + Args: + c: + Array of feature vectors. + d: + Vector of discrete features formatted as a numerical 1D array-like. + n_neighbors: + The number of nearest neighbors. + Returns: + radius: + Vector of radii defined by the k nearest neighbors. + label_counts: + Label counts of the discrete feature vector. + k_all: + Array of the number of points within a radius. 
+ """ + d = d.flatten() + n_samples = d.shape[0] + c = c.reshape((n_samples, -1)) + + radius = np.empty(n_samples) + label_counts = np.empty(n_samples) + k_all = np.empty(n_samples) + nn = NearestNeighbors(metric="chebyshev") + for label in np.unique(d): + mask = d == label + count = np.sum(mask) + if count > 1: + k = min(n_neighbors, count - 1) + nn.set_params(n_neighbors=k) + radius[mask] = _get_radius_from_nn(c[mask], nn) + k_all[mask] = k + label_counts[mask] = np.sum(mask) + return radius, label_counts, k_all + + +def _get_radius_from_nn(X: npt.ArrayLike, nn: NearestNeighbors) -> npt.ArrayLike: + """ + Get the radius of nearest neighbors in `nn` model with dataset `X`. + + Inspired by :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + Args: + X: + Array of features. + nn: + Instance of the nearest neighbors class. + Returns: + Vector of radii defined by the k nearest neighbors. + """ + nn.fit(X) + r = nn.kneighbors()[0] + return np.nextafter(r[:, -1], 0) + + +def _num_points_within_radius(x: npt.ArrayLike, radius: npt.ArrayLike) -> npt.ArrayLike: + """For each point, determine the number of other points within a given radius + Function from https://github.com/jannisteunissen/mutual_information + Args: + X: + (Continuous) feature array. + radius: + Vector of radii. + Returns: + Vector containing the number of points within a radius. + """ + kd = KDTree(x, metric="chebyshev") + nx = kd.query_radius(x, radius, count_only=True, return_distance=False) + return np.array(nx) + + +def _num_points_within_radius_cd( + c: npt.ArrayLike, + d: npt.ArrayLike, + radius: npt.ArrayLike) -> npt.ArrayLike: + """ + For each point, determine the number of other points within a given radius. + + Inspired by :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + Args: + c: + (Continuous) feature array. + d: + Discrete feature vector formatted as a numerical 1D array-like + radius: + Vector of radii. + Returns: + Vector containing the number of points within a radius. + """ + c = c.reshape((-1, 1)) + n_samples = c.shape[0] + m_all = np.empty(n_samples) + for label in np.unique(d): + mask = d == label + m_all[mask] = _num_points_within_radius(c[mask], radius[mask]) + return m_all + + +def _iterate_columns(X, columns=None): + """Iterate over columns of a matrix. + Copied from :func:`sklearn.feature_selection._mutual_info`. + + Parameters + ---------- + X : ndarray or csc_matrix, shape (n_samples, n_features) + Matrix over which to iterate. + + columns : iterable or None, default=None + Indices of columns to iterate over. If None, iterate over all columns. + + Yields + ------ + x : ndarray, shape (n_samples,) + Columns of `X` in dense format. 
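+
+    Examples
+    --------
+    A minimal sketch on a small dense array:
+
+    >>> import numpy as np
+    >>> [col.tolist() for col in _iterate_columns(np.array([[1, 2], [3, 4]]))]
+    [[1, 3], [2, 4]]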
+ """ + if columns is None: + columns = range(X.shape[1]) + + if issparse(X): + for i in columns: + x = np.zeros(X.shape[0]) + start_ptr, end_ptr = X.indptr[i], X.indptr[i + 1] + x[X.indices[start_ptr:end_ptr]] = X.data[start_ptr:end_ptr] + yield x + else: + for i in columns: + yield X[:, i] diff --git a/dwave/plugins/sklearn/transformers.py b/dwave/plugins/sklearn/transformers.py index 01275d0..db444ee 100644 --- a/dwave/plugins/sklearn/transformers.py +++ b/dwave/plugins/sklearn/transformers.py @@ -20,22 +20,14 @@ import dimod from dwave.cloud.exceptions import ConfigFileError, SolverAuthenticationError -from dwave.plugins.sklearn.utilities import ( - corrcoef, - _compute_cmi_distance, - _compute_mi, - _iterate_columns) +from dwave.plugins.sklearn._conditional_mutual_info import estimate_mi_matrix +from dwave.plugins.sklearn.utilities import corrcoef from dwave.system import LeapHybridCQMSampler import numpy as np import numpy.typing as npt -from scipy.sparse import issparse from sklearn.base import BaseEstimator from sklearn.feature_selection import SelectorMixin -from sklearn.feature_selection import mutual_info_classif, mutual_info_regression -from sklearn.preprocessing import scale -from sklearn.utils import check_random_state -from sklearn.utils.parallel import Parallel, delayed -from sklearn.utils.validation import check_is_fitted, check_array, check_X_y +from sklearn.utils.validation import check_is_fitted __all__ = ["SelectFromQuadraticModel"] @@ -450,154 +442,3 @@ def _check_params(X: npt.ArrayLike, if X.shape[0] <= 1: raise ValueError("X must have at least two rows") - - -def estimate_mi_matrix( - X: npt.ArrayLike, - y: npt.ArrayLike, - discrete_features: typing.Union[str, npt.ArrayLike] = "auto", - discrete_target: bool = False, - n_neighbors: int = 4, - conditional: bool = True, - copy: bool = True, - random_state: typing.Union[None, int] = None, - n_jobs: typing.Union[None, int] = None, -) -> npt.ArrayLike: - """ - For the feature array `X` and the target array `y` computes - the matrix of (conditional) mutual information interactions. - The matrix is defined as follows: - - `M_(i, i) = I(x_i; y)` - - If `conditional = True`, then the off-diagonal terms are computed: - - `M_(i, j) = (I(x_i; y| x_j) + I(x_j; y| x_i)) / 2` - - Otherwise - - `M_(i, j) = I(x_i; x_j)` - - Computation of I(x; y) uses modified scikit-learn methods. - The computation of I(x; y| z) is based on - https://github.com/jannisteunissen/mutual_information and [3]_. - - The method can be computationally expensive for a large number of features (> 1000) and - a large number of samples (> 100000). In this case, it can be advisable to downsample the - dataset. - - The estimation methods are linear in the number of outcomes (labels) of discrete distributions. - It may be beneficial to treat the discrete distrubitions with a large number of labels as - continuous distributions. 
- - Args: - X: See :func:`sklearn.feature_selection.mutual_info_regression` - - y: See :func:`sklearn.feature_selection.mutual_info_regression` - - conditional: bool, default=True - Whether to compute the off-diagonal terms using the conditional mutual - information or joint mutual information - - discrete_features: See :func:`sklearn.feature_selection.mutual_info_regression` - - discrete_target: bool, default=False - Whether the target variable `y` is discrete - - n_neighbors: See :func:`sklearn.feature_selection.mutual_info_regression` - - copy: See :func:`sklearn.feature_selection.mutual_info_regression` - - random_state: See :func:`sklearn.feature_selection.mutual_info_regression` - - n_jobs: int, default=None - The number of parallel jobs to run for the conditional mutual information - computation. The parallelization is over the columns of `X`. - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. - ``-1`` means using all processors. See :term:`Glossary ` - for more details. - - Returns: - mi_matrix : ndarray, shape (n_features, n_features) - Interaction matrix between the features using (conditional) mutual information. - A negative value will be replaced by 0. - - References: - .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual - information". Phys. Rev. E 69, 2004. - .. [2] B. C. Ross "Mutual Information between Discrete and Continuous - Data Sets". PLoS ONE 9(2), 2014. - .. [3] Mesner, Octavio César, and Cosma Rohilla Shalizi. "Conditional - mutual information estimation for mixed, discrete and continuous - data." IEEE Transactions on Information Theory 67.1 (2020): 464-484. - """ - X, y = check_X_y(X, y, accept_sparse="csc", y_numeric=not discrete_target) - n_samples, n_features = X.shape - - if isinstance(discrete_features, (str, bool)): - if isinstance(discrete_features, str): - if discrete_features == "auto": - discrete_features = issparse(X) - else: - raise ValueError("Invalid string value for discrete_features.") - discrete_mask = np.empty(n_features, dtype=bool) - discrete_mask.fill(discrete_features) - else: - discrete_features = check_array(discrete_features, ensure_2d=False) - if np.issubdtype(discrete_features.dtype, bool): - discrete_mask = np.zeros(n_features, dtype=bool) - discrete_mask[discrete_features] = True - else: - discrete_mask = discrete_features - - continuous_mask = ~discrete_mask - if np.any(continuous_mask) and issparse(X): - raise ValueError("Sparse matrix `X` can't have continuous features.") - - rng = check_random_state(random_state) - if np.any(continuous_mask): - X = X.astype(np.float64, copy=copy) - X[:, continuous_mask] = scale( - X[:, continuous_mask], with_mean=False, copy=False - ) - - # Add small noise to continuous features as advised in Kraskov et. al. 
- means = np.maximum(1, np.mean(np.abs(X[:, continuous_mask]), axis=0)) - X[:, continuous_mask] += ( - 1e-10 - * means - * rng.standard_normal(size=(n_samples, np.sum(continuous_mask))) - ) - - if not discrete_target: - y = scale(y, with_mean=False) - y += ( - 1e-10 - * np.maximum(1, np.mean(np.abs(y))) - * rng.standard_normal(size=n_samples) - ) - - mi_matrix = np.zeros((n_features, n_features), dtype=np.float64) - # Computing the diagonal terms - diagonal_vals = Parallel(n_jobs=n_jobs)( - delayed(_compute_mi)(x, y, discrete_feature, discrete_target, n_neighbors) - for x, discrete_feature in zip(_iterate_columns(X), discrete_mask) - ) - np.fill_diagonal(mi_matrix, diagonal_vals) - # Computing the off-diagonal terms - off_diagonal_iter = combinations(zip(_iterate_columns(X), discrete_mask), 2) - if conditional: - off_diagonal_vals = Parallel(n_jobs=n_jobs)( - delayed(_compute_cmi_distance) - (xi, xj, y, discrete_feature_i, discrete_feature_j, discrete_target, n_neighbors) - for (xi, discrete_feature_i), (xj, discrete_feature_j) in off_diagonal_iter) - else: - off_diagonal_vals = Parallel(n_jobs=n_jobs)( - delayed(_compute_mi) - (xi, xj, discrete_feature_i, discrete_feature_j, n_neighbors) - for (xi, discrete_feature_i), (xj, discrete_feature_j) in off_diagonal_iter) - - off_diagonal_val = iter(off_diagonal_vals) - for i, j in combinations(range(n_features), 2): - mi_matrix[i, j] = mi_matrix[j, i] = next(off_diagonal_val) - return mi_matrix diff --git a/dwave/plugins/sklearn/utilities.py b/dwave/plugins/sklearn/utilities.py index bbdf956..caccfcd 100644 --- a/dwave/plugins/sklearn/utilities.py +++ b/dwave/plugins/sklearn/utilities.py @@ -51,11 +51,6 @@ import numpy as np import numpy.typing as npt -from scipy.sparse import issparse -from scipy.special import digamma -from scipy.stats import entropy -from sklearn.metrics.cluster import mutual_info_score -from sklearn.neighbors import NearestNeighbors, KDTree __all__ = ["corrcoef", "cov", "dot_2d"] @@ -229,385 +224,3 @@ def dot_2d(a: npt.ArrayLike, b: npt.ArrayLike, *, out.flush() return out - - -def _compute_cmi_distance( - xi: npt.ArrayLike, - xj: npt.ArrayLike, - y: npt.ArrayLike, - discrete_feature_i: bool, - discrete_feature_j: bool, - discrete_target: bool, - n_neighbors: int = 4, -): - """ - Computing a distance `d` based on the conditional mutual infomation - between features `x_i` and `x_j`: `d = (I(x_i; y | x_j)+I(x_j; y | x_i))/2`. - - References - ---------- - .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual - information". Phys. Rev. E 69, 2004. - .. [2] B. C. Ross "Mutual Information between Discrete and Continuous - Data Sets". PLoS ONE 9(2), 2014. - .. [3] Mesner, Octavio César, and Cosma Rohilla Shalizi. "Conditional - mutual information estimation for mixed, discrete and continuous - data." IEEE Transactions on Information Theory 67.1 (2020): 464-484. 
- """ - if discrete_feature_i and discrete_feature_j and discrete_target: - ans = _compute_cmip_d(xi, xj, y) - elif discrete_target: - ans = _compute_cmip_ccd(xi, xj, y, n_neighbors) - elif discrete_feature_i and (not discrete_target): - ans = _compute_cmip_cdc(xj, xi, y, n_neighbors) - elif (not discrete_feature_i) and discrete_feature_j and (not discrete_target): - ans = _compute_cmip_cdc(xi, xj, y, n_neighbors) - else: - ans = _compute_cmip_c(xi, xj, y, n_neighbors) - return np.mean(ans) - - -def _compute_cmip_d( - xi: npt.ArrayLike, - xj: npt.ArrayLike, - y: npt.ArrayLike) -> typing.Tuple[float]: - """ - Computes conditional mutual infomation pair `I(x_i; y | x_j)` and `I(x_j; y | x_i)`. - All random variables `x_i`, `x_j` and `y` are discrete. - - Adpated from https://github.com/dwave-examples/mutual-information-feature-selection - """ - # Computing joint probability distribution for the features and the target - bin_boundaries = [np.hstack((np.unique(col), np.inf)) for col in (xi, xj, y)] - prob, _ = np.histogramdd(np.vstack((xi, xj, y)).T, bins=bin_boundaries) - - # Computing entropy - Hijy = entropy(prob.flatten()) # H(x_i, x_j, y) - Hij = entropy(prob.sum(axis=2).flatten()) # H(x_i, x_j) - cmi_ij = ( - Hij-Hijy - + entropy(prob.sum(axis=0).flatten()) # H(x_j, y) - - entropy(prob.sum(axis=(0, 2))) # H(x_j) - ) - cmi_ji = ( - Hij-Hijy - + entropy(prob.sum(axis=1).flatten()) # H(x_i, y) - - entropy(prob.sum(axis=(1, 2))) # H(x_i) - ) - return cmi_ij, cmi_ji - - -def _compute_cmip_c( - xi: npt.ArrayLike, - xj: npt.ArrayLike, - y: npt.ArrayLike, - n_neighbors: int = 4) -> typing.Tuple[float]: - """ - Computes conditional mutual information pair `I(x_i; y | x_j)` - and `I(x_j; y | x_i)`. All random variables `x_i`, `x_j` and `y` - are assumed to be continuous. - - Adapted from https://github.com/jannisteunissen/mutual_information - """ - xi = xi.reshape((-1, 1)) - xj = xj.reshape((-1, 1)) - y = y.reshape((-1, 1)) - - # Here we rely on NearestNeighbors to select the fastest algorithm. - radius = _get_radius_k_neighbours(np.hstack((xi, xj, y)), - n_neighbors=n_neighbors) - - # KDTree is explicitly fit to allow for the querying of number of - # neighbors within a specified radius - n_j = _num_points_within_radius(xj, radius) - n_ij = _num_points_within_radius(np.hstack((xi, xj)), radius) - n_jy = _num_points_within_radius(np.hstack((y, xj)), radius) - - n_i = _num_points_within_radius(xi, radius) - n_iy = _num_points_within_radius(np.hstack((y, xi)), radius) - - return _get_cmi_pair_from_estimates([n_neighbors], n_ij, n_j, n_jy, n_i, n_iy) - - -def _compute_cmip_ccd( - xi: npt.ArrayLike, - xj: npt.ArrayLike, - y: npt.ArrayLike, - n_neighbors: int = 4) -> typing.Tuple[float]: - """ - Computes conditional mutual information pair `I(x_i; y | x_j)` - and `I(x_j; y | x_i)`. Random variables `x_i`, `x_j` are assumed - to be continuous, while `y` is assumed to be discrete. - - Adapted from https://github.com/jannisteunissen/mutual_information and - :func:`sklearn.feature_selection.mutual_info_regression` - """ - xi = xi.reshape((-1, 1)) - xj = xj.reshape((-1, 1)) - - # Here we rely on NearestNeighbors to select the fastest algorithm. - radius, label_counts, k_all = _get_radius_k_neighbours_d(np.hstack((xi, xj)), y, - n_neighbors=n_neighbors) - - # Ignore points with unique labels. 
- mask = label_counts > 1 - label_counts = label_counts[mask] - k_all = k_all[mask] - xi = xi[mask] - xj = xj[mask] - y = y[mask] - radius = radius[mask] - # KDTree is explicitly fit to allow for the querying of number of - # neighbors within a specified radius - # continuous - n_i = _num_points_within_radius(xi, radius) - n_j = _num_points_within_radius(xj, radius) - n_ij = _num_points_within_radius(np.hstack((xi, xj)), radius) - # mixed discrete/continuous estimates - n_iy = _num_points_within_radius_cd(xi, y, radius) - n_jy = _num_points_within_radius_cd(xj, y, radius) - - return _get_cmi_pair_from_estimates(k_all, n_ij, n_j, n_jy, n_i, n_iy) - - -def _compute_cmip_cdc( - xi: npt.ArrayLike, - xj: npt.ArrayLike, - y: npt.ArrayLike, - n_neighbors: int = 4) -> typing.Tuple[float]: - """ - Computes conditional mutual information pair `I(x_i; y | x_j)` - and `I(x_j; y | x_i)`. Random variables `x_i`, `y` are assumed - to be continuous, while `x_j` is assumed to be discrete. - - Adapted from https://github.com/jannisteunissen/mutual_information and - :func:`sklearn.feature_selection.mutual_info_regression` - """ - xi = xi.reshape((-1, 1)) - y = y.reshape((-1, 1)) - - # Here we rely on NearestNeighbors to select the fastest algorithm. - radius, label_counts, k_all = _get_radius_k_neighbours_d(np.hstack((xi, y)), xj, - n_neighbors=n_neighbors) - - # Ignore points with unique labels. - mask = label_counts > 1 - label_counts = label_counts[mask] - k_all = k_all[mask] - xi = xi[mask] - xj = xj[mask] - y = y[mask] - radius = radius[mask] - - # KDTree is explicitly fit to allow for the querying of number of - # neighbors within a specified radius - # continuous - n_i = _num_points_within_radius(xi, radius) - n_iy = _num_points_within_radius(np.hstack((xi, y)), radius) - # mixed coninuous/discrete estimates - n_ij = _num_points_within_radius_cd(xi, xj, radius) - n_jy = _num_points_within_radius_cd(y, xj, radius) - # discrete estimates - n_j = label_counts - - return _get_cmi_pair_from_estimates(k_all, n_ij, n_j, n_jy, n_i, n_iy) - - -def _get_cmi_pair_from_estimates( - k_all: npt.ArrayLike, n_ij: npt.ArrayLike, n_j: npt.ArrayLike, - n_jy: npt.ArrayLike, n_i: npt.ArrayLike, n_iy: npt.ArrayLike) -> typing.Tuple[float]: - - common_terms = np.mean(digamma(k_all))-np.mean(digamma(n_ij)) - cmi_ij = common_terms+np.mean(digamma(n_j))-np.mean(digamma(n_jy)) - cmi_ji = common_terms+np.mean(digamma(n_i))-np.mean(digamma(n_iy)) - return max(0, cmi_ij), max(0, cmi_ji) - - -def _compute_mi( - x: npt.ArrayLike, - y: npt.ArrayLike, - discrete_feature_x: bool, - discrete_feature_y: bool, - n_neighbors: int = 4, -): - """ - Compute mutual information between features `I(x_i; x_j)`. - - References - ---------- - .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual - information". Phys. Rev. E 69, 2004. - .. [2] B. C. Ross "Mutual Information between Discrete and Continuous - Data Sets". PLoS ONE 9(2), 2014. - """ - if discrete_feature_y and discrete_feature_x: - return mutual_info_score(x, y) - elif discrete_feature_x: - return _compute_mi_cd(y, x, n_neighbors=n_neighbors) - elif discrete_feature_y: - return _compute_mi_cd(x, y, n_neighbors=n_neighbors) - else: - return _compute_mi_cc(x, y, n_neighbors=n_neighbors) - - -def _compute_mi_cc(x: npt.ArrayLike, y: npt.ArrayLike, n_neighbors: int) -> float: - """Computes mutual information `I(x; y)`. Random variables `x`, `y` are assumed - to be continuous. 
- - Adapted from :func:`sklearn.feature_selection.mutual_info_regression` - """ - n_samples = x.size - - x = x.reshape((-1, 1)) - y = y.reshape((-1, 1)) - - # Here we rely on NearestNeighbors to select the fastest algorithm. - radius = _get_radius_k_neighbours(np.hstack((x, y)), n_neighbors) - - # KDTree is explicitly fit to allow for the querying of number of - # neighbors within a specified radius - nx = _num_points_within_radius(x, radius) - ny = _num_points_within_radius(y, radius) - - mi = ( - digamma(n_samples) - + digamma(n_neighbors) - - np.mean(digamma(nx)) - - np.mean(digamma(ny)) - ) - - return max(0, mi) - - -def _compute_mi_cd(x: npt.ArrayLike, y: npt.ArrayLike, n_neighbors: int) -> float: - """Computes mutual information `I(x; y)`. Random variable `x`, is assumed - to be continuous, while `y` is assumed to be discrete. - - Adapted from :func:`sklearn.feature_selection.mutual_info_regression` - """ - radius, label_counts, k_all = _get_radius_k_neighbours_d(x, y, n_neighbors=n_neighbors) - - # Ignore points with unique labels. - mask = label_counts > 1 - n_samples = np.sum(mask) - label_counts = label_counts[mask] - k_all = k_all[mask] - x = x[mask] - radius = radius[mask] - - m_all = _num_points_within_radius(x.reshape(-1, 1), radius) - - mi = ( - digamma(n_samples) - + np.mean(digamma(k_all)) - - np.mean(digamma(label_counts)) - - np.mean(digamma(m_all)) - ) - - return max(0, mi) - - -def _get_radius_k_neighbours(X: npt.ArrayLike, n_neighbors: int = 4) -> npt.ArrayLike: - """ - Determine smallest radius around `X` containing `n_neighbors` neighbors - Inspired by :func:`sklearn.feature_selection.mutual_info_regression` - """ - nn = NearestNeighbors(metric="chebyshev", n_neighbors=n_neighbors) - return _get_radius_from_nn(X, nn) - - -def _get_radius_k_neighbours_d( - c: npt.ArrayLike, - d: npt.ArrayLike, - n_neighbors: int = 4 -) -> typing.Tuple[npt.ArrayLike, npt.ArrayLike, npt.ArrayLike]: - """ - Determine smallest radius around `c` and `d` containing `n_neighbors` neighbors - Inspired by :func:`sklearn.feature_selection.mutual_info_regression` - """ - d = d.flatten() - n_samples = d.shape[0] - c = c.reshape((n_samples, -1)) - - radius = np.empty(n_samples) - label_counts = np.empty(n_samples) - k_all = np.empty(n_samples) - nn = NearestNeighbors(metric="chebyshev") - for label in np.unique(d): - mask = d == label - count = np.sum(mask) - if count > 1: - k = min(n_neighbors, count - 1) - nn.set_params(n_neighbors=k) - radius[mask] = _get_radius_from_nn(c[mask], nn) - k_all[mask] = k - label_counts[mask] = np.sum(mask) - return radius, label_counts, k_all - - -def _get_radius_from_nn(X: npt.ArrayLike, nn: NearestNeighbors) -> npt.ArrayLike: - nn.fit(X) - r = nn.kneighbors()[0] - return np.nextafter(r[:, -1], 0) - - -def _num_points_within_radius(x: npt.ArrayLike, radius: npt.ArrayLike) -> npt.ArrayLike: - """For each point, determine the number of other points within a given radius - Function from https://github.com/jannisteunissen/mutual_information - - :param x: ndarray, shape (n_samples, n_dim) - :param radius: radius, shape (n_samples,) - :returns: number of points within radius - - """ - kd = KDTree(x, metric="chebyshev") - nx = kd.query_radius(x, radius, count_only=True, return_distance=False) - return np.array(nx) - - -def _num_points_within_radius_cd( - c: npt.ArrayLike, - d: npt.ArrayLike, - radius: npt.ArrayLike) -> npt.ArrayLike: - """ - For each point, determine the number of other points within a given radius - Inspired by 
:func:`sklearn.feature_selection.mutual_info_regression` - """ - c = c.reshape((-1, 1)) - n_samples = c.shape[0] - m_all = np.empty(n_samples) - for label in np.unique(d): - mask = d == label - m_all[mask] = _num_points_within_radius(c[mask], radius[mask]) - return m_all - - -def _iterate_columns(X, columns=None): - """Iterate over columns of a matrix. - Copied from :func:`sklearn.feature_selection._mutual_info` - - Parameters - ---------- - X : ndarray or csc_matrix, shape (n_samples, n_features) - Matrix over which to iterate. - - columns : iterable or None, default=None - Indices of columns to iterate over. If None, iterate over all columns. - - Yields - ------ - x : ndarray, shape (n_samples,) - Columns of `X` in dense format. - """ - if columns is None: - columns = range(X.shape[1]) - - if issparse(X): - for i in columns: - x = np.zeros(X.shape[0]) - start_ptr, end_ptr = X.indptr[i], X.indptr[i + 1] - x[X.indices[start_ptr:end_ptr]] = X.data[start_ptr:end_ptr] - yield x - else: - for i in columns: - yield X[:, i] diff --git a/tests/test_mutual_info.py b/tests/test_mutual_info.py new file mode 100644 index 0000000..45000b0 --- /dev/null +++ b/tests/test_mutual_info.py @@ -0,0 +1,369 @@ + +# Tests in TestMI are adapted from sklearn tests and licenced under: +# BSD 3-Clause License +# +# Copyright (c) 2007-2024 The scikit-learn developers. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Tests in TestCMI are adapted from https://github.com/jannisteunissen/mutual_information and +# distributed under the following licence: +# +# BSD 3-Clause License +# +# Copyright (c) 2022, Jannis Teunissen +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The modifications of the test and the tests in TestDCMI are distributed +# under the Apache 2.0 Software license: +# +# Copyright 2023 D-Wave Systems Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np + +from dwave.plugins.sklearn._conditional_mutual_info import ( + _compute_mi, + _compute_cmip_c, _compute_cmip_d, + _compute_cmip_cdc, _compute_cmip_ccd +) + +from numpy.linalg import det +from sklearn.feature_selection import mutual_info_classif +from sklearn.preprocessing import scale +from sklearn.utils import check_random_state + + +class TestMI(unittest.TestCase): + def test_against_sklearn(self): + # the test will break if sklearn modifies its implementation + from sklearn.feature_selection._mutual_info import _compute_mi_cd, _compute_mi_cc + from sklearn.metrics.cluster import mutual_info_score + # Test discrete implementation + n_samples, n_ss, n_neighbors, random_state = 103, 51, 4, 15 + np.random.seed(random_state) + xi = np.random.randint(0, 11, (n_samples, )) + xj = np.hstack(( + np.random.randint(-2, 1, (n_ss, )), + np.random.randint(10, 14, (n_samples-n_ss, )))) + np.random.shuffle(xi) + np.random.shuffle(xj) + xi_c = scale(np.random.randn(n_samples), with_mean=False) + xj_c = scale(np.random.randn(n_samples), with_mean=False) + + mi_ij = _compute_mi( + xi, xj, True, True, n_neighbors=n_neighbors) + mi_ij_cl = mutual_info_score(xi, xj) + self.assertTrue( + np.allclose(mi_ij_cl, mi_ij), + msg="Discrete mutual information is not computed correctly") + + mi_ij = _compute_mi( + xi_c, xj, False, True, n_neighbors=n_neighbors) + mi_ij_cl = _compute_mi_cd(xi_c, xj, n_neighbors=n_neighbors) + self.assertTrue( + np.allclose(mi_ij_cl, mi_ij, atol=1e-5), + msg=f"Error in continuous features/discrete target MI estimation is larger than expected {abs(mi_ij_cl-mi_ij)}") + + mi_ij = _compute_mi(xi_c, xj_c, False, False, n_neighbors=n_neighbors) + mi_ij_cl = _compute_mi_cc(xi_c, xj_c, n_neighbors=n_neighbors) + self.assertTrue( + np.allclose(mi_ij_cl, mi_ij, atol=1e-5), + msg=f"Error in purely continuous MI is larger than expected {abs(mi_ij_cl-mi_ij)}") + + def test_compute_mi_dd(self): + # In discrete case computations are straightforward and can be done + # by hand on given vectors. + x = np.array([0, 1, 1, 0, 0]) + y = np.array([1, 0, 0, 0, 1]) + + H_x = H_y = -(3 / 5) * np.log(3 / 5) - (2 / 5) * np.log(2 / 5) + H_xy = -1 / 5 * np.log(1 / 5) - 2 / 5 * np.log(2 / 5) - 2 / 5 * np.log(2 / 5) + I_xy = H_x + H_y - H_xy + + self.assertAlmostEqual(_compute_mi(x, y, discrete_feature_x=True, + discrete_feature_y=True), I_xy, places=4) + + def test_compute_mi_cc(self): + # For two continuous variables a good approach is to test on bivariate + # normal distribution, where mutual information is known. + + # Mean of the distribution, irrelevant for mutual information. + mean = np.zeros(2) + + # Setup covariance matrix with correlation coeff. equal 0.5. + sigma_1 = 1 + sigma_2 = 10 + corr = 0.5 + cov = np.array( + [ + [sigma_1**2, corr * sigma_1 * sigma_2], + [corr * sigma_1 * sigma_2, sigma_2**2], + ] + ) + + # True theoretical mutual information. 
+        I_theory = np.log(sigma_1) + np.log(sigma_2) - 0.5 * np.log(np.linalg.det(cov))
+
+        rng = check_random_state(0)
+        Z = rng.multivariate_normal(mean, cov, size=5001).astype(np.float64, copy=False)
+
+        x, y = Z[:, 0], Z[:, 1]
+
+        # Theory and computed values won't be very close
+        # We here check with a large relative tolerance
+        for n_neighbors in [4, 6, 8]:
+            I_computed = _compute_mi(
+                x, y, discrete_feature_x=False, discrete_feature_y=False, n_neighbors=n_neighbors
+            )
+            self.assertLessEqual(abs(I_computed/I_theory-1.0), 1e-1)
+
+    def test_compute_mi_cd(self):
+        # To test define a joint distribution as follows:
+        # p(x, y) = p(x) p(y | x)
+        # X ~ Bernoulli(p)
+        # (Y | x = 0) ~ Uniform(-1, 1)
+        # (Y | x = 1) ~ Uniform(0, 2)
+
+        # Use the following formula for mutual information:
+        # I(X; Y) = H(Y) - H(Y | X)
+        # Two entropies can be computed by hand:
+        # H(Y) = -(1-p)/2 * ln((1-p)/2) - p/2*log(p/2) - 1/2*log(1/2)
+        # H(Y | X) = ln(2)
+
+        # Now we need to implement sampling from our distribution, which is
+        # done easily using conditional distribution logic.
+
+        n_samples = 5001
+        rng = check_random_state(0)
+
+        for p in [0.3, 0.5, 0.7]:
+            x = rng.uniform(size=n_samples) > p
+
+            y = np.empty(n_samples, np.float64)
+            mask = x == 0
+            y[mask] = rng.uniform(-1, 1, size=np.sum(mask))
+            y[~mask] = rng.uniform(0, 2, size=np.sum(~mask))
+
+            I_theory = -0.5 * (
+                (1 - p) * np.log(0.5 * (1 - p)) + p * np.log(0.5 * p) + np.log(0.5)
+            ) - np.log(2)
+
+            # Check with the same relative tolerance as above.
+            for n_neighbors in [4, 6, 8]:
+                I_computed = _compute_mi(
+                    x, y, discrete_feature_x=True, discrete_feature_y=False, n_neighbors=n_neighbors
+                )
+                self.assertLessEqual(abs(I_computed/I_theory-1.0), 1e-1)
+
+    def test_compute_mi_cd_unique_label(self):
+        # Test that adding unique label doesn't change MI.
+        n_samples = 100
+        x = np.random.uniform(size=n_samples) > 0.5
+
+        y = np.empty(n_samples, np.float64)
+        mask = x == 0
+        y[mask] = np.random.uniform(-1, 1, size=np.sum(mask))
+        y[~mask] = np.random.uniform(0, 2, size=np.sum(~mask))
+
+        mi_1 = _compute_mi(x, y, discrete_feature_x=True, discrete_feature_y=False)
+
+        x = np.hstack((x, 2))
+        y = np.hstack((y, 10))
+        mi_2 = _compute_mi(x, y, discrete_feature_x=True, discrete_feature_y=False)
+
+        self.assertAlmostEqual(mi_1, mi_2, places=5)
+
+
+class TestCMI(unittest.TestCase):
+    def test_discrete(self):
+        # Third test from "Conditional Mutual Information Estimation for Mixed Discrete
+        # and Continuous Variables with Nearest Neighbors" (Mesner & Shalizi). Fully
+        # discrete
+        def rescale(xi):
+            xi = scale(xi + 0.0, with_mean=False, copy=False)
+            # Add small noise to continuous features as advised in Kraskov et al.
+ xi = xi.astype(np.float64, copy=False) + means = np.maximum(1, np.mean(np.abs(xi), axis=0)) + xi += (1e-10 * means * rng.standard_normal(size=(xi.shape[0], ))) + return xi + N = 1001 + rng = check_random_state(0) + choices = np.array([[1, 1], [-1, -1], [1, -1], [-1, 1]]) + p_discrete = rng.choice(4, p=[0.4, 0.4, 0.1, 0.1], size=N) + xy_discrete = choices[p_discrete] + x, y = xy_discrete[:, 0], xy_discrete[:, 1] + z = rng.poisson(2, size=N) + x_c = rescale(x) + y_c = rescale(y) + z_c = rescale(z) + cmi_analytic = 2 * 0.4 * np.log(0.4/0.5**2) + 2 * 0.1 * np.log(0.1/0.5**2) + # checking the computations with a large relative tolerance + for n_neighbors in [3, 5, 7]: + cmi_ij, _ = _compute_cmip_d(x, z, y) + self.assertLessEqual(abs(cmi_ij/cmi_analytic-1.0), 1e-1) + cmi_ij, _ = _compute_cmip_c(x_c, z_c, y_c, n_neighbors) + self.assertLessEqual(abs(cmi_ij/cmi_analytic-1.0), 1e-1) + cmi_ij, _ = _compute_cmip_ccd(x_c, z_c, y, n_neighbors) + self.assertLessEqual(abs(cmi_ij/cmi_analytic-1.0), 1e-1) + cmi_ij, _ = _compute_cmip_cdc(x_c, z, y_c, n_neighbors) + self.assertLessEqual(abs(cmi_ij/cmi_analytic-1.0), 1e-1) + + def test_bivariate(self): + """Test with bivariate normal variables""" + N = 1001 + rng = check_random_state(0) + mu = np.zeros(2) + cov = np.array([[1., 0.8], [0.8, 1.0]]) + xy_gauss = rng.multivariate_normal(mu, cov, size=N) + x, y = xy_gauss[:, 0], xy_gauss[:, 1] + z = rng.normal(size=N) + + cmi_analytic = -0.5 * np.log(det(cov)) + # checking the computations with a large relative tolerance + for n_neighbors in [4, 6, 8]: + cmi_ij, _ = _compute_cmip_c(x, z, y, n_neighbors) + self.assertAlmostEqual(cmi_ij/cmi_analytic, 1.0, places=1) + + def test_trivariate(self): + # Test with 'trivariate' normal variables x, y, z + + mu = np.zeros(3) + + # Covariance matrix + cov_xy = 0.7 + cov_xz = 0.5 + cov_yz = 0.3 + cov = np.array([[1, cov_xy, cov_xz], + [cov_xy, 1.0, cov_yz], + [cov_xz, cov_yz, 1]]) + N = 1001 + rng = check_random_state(0) + samples = rng.multivariate_normal(mu, cov, size=N) + x, y, z = samples[:, 0], samples[:, 1], samples[:, 2] + + # Construct minor matrices for x and y + cov_x = cov[1:, 1:] + cov_y = cov[[0, 2]][:, [0, 2]] + cmi_analytic = -0.5 * np.log(det(cov) / (det(cov_x) * det(cov_y))) + # checking the computations with a large relative tolerance + for n_neighbors in [4, 6, 8]: + cmi_ij, _ = _compute_cmip_c(x, z, y, n_neighbors) + self.assertAlmostEqual(cmi_ij/cmi_analytic, 1, places=1) + + +class TestDCMI(unittest.TestCase): + def test_cmi_symmetry(self): + # The pair produced by the computations should be symmetric in the first two arguments + np.random.seed(42) + n_samples, n_neighbors = 4003, 4 + # Test continuous implementation + Xy = np.random.randn(n_samples, 3) + cmi_ij_pair = _compute_cmip_c(Xy[:, 0], Xy[:, 1], Xy[:, 2], n_neighbors=n_neighbors) + cmi_ji_pair = _compute_cmip_c(Xy[:, 1], Xy[:, 0], Xy[:, 2], n_neighbors=n_neighbors) + self.assertAlmostEqual( + sum(cmi_ij_pair), sum(cmi_ji_pair), places=3, + msg="Computation for continuous conditional mutual information is not symmetric") + + c1 = np.random.randn(n_samples,) + c2 = np.random.randn(n_samples,) + n_ss = 1001 + d = np.hstack(( + np.random.randint(-2, 1, (n_ss, )), + np.random.randint(10, 14, (n_samples-n_ss, )))) + np.random.shuffle(d) + cmi_ij_pair = _compute_cmip_ccd(c1, c2, d, n_neighbors=n_neighbors) + cmi_ji_pair = _compute_cmip_ccd(c2, c1, d, n_neighbors=n_neighbors) + self.assertAlmostEqual( + sum(cmi_ij_pair), sum(cmi_ji_pair), places=3, + msg="Computation for mixed continuous/discrete 
conditional mutual information is not symmetric")
+
+    def test_cmi_discrete(self):
+        # We test the algorithm using the chain-rule identity `I(x;y|z) = I(x;y) + I(z;y|x) - I(z;y)`.
+        # If our algorithm is correct, this identity should hold up to estimation error.
+        np.random.seed(42)
+        # Test discrete implementation
+        n_samples, n_ss, n_neighbors = 103, 51, 4
+        xi = np.random.randint(0, 11, (n_samples, ))
+        xj = np.hstack((
+            np.random.randint(-2, 1, (n_ss, )),
+            np.random.randint(10, 14, (n_samples-n_ss, ))))
+        np.random.shuffle(xi)
+        np.random.shuffle(xj)
+        y = np.random.randint(-12, -10, (n_samples, ))
+        cmi_ij, cmi_ji = _compute_cmip_d(xi, xj, y)
+        # we estimate the marginal mutual information terms using sklearn's mutual_info_classif
+        mi_i = mutual_info_classif(
+            xi.reshape(-1, 1), y, discrete_features=True, n_neighbors=n_neighbors)
+        mi_j = mutual_info_classif(
+            xj.reshape(-1, 1), y, discrete_features=True, n_neighbors=n_neighbors)
+
+        self.assertTrue(
+            np.allclose(cmi_ij + mi_j - mi_i, cmi_ji, atol=1e-5),
+            msg="The chain rule for discrete conditional mutual information is violated")
+        cmi_ij2 = _compute_cmip_d(xj, xi, y)
+        self.assertTrue(
+            np.allclose(sum(cmi_ij2), cmi_ji+cmi_ij, atol=1e-5),
+            msg="Discrete conditional mutual information computation is not symmetric")
diff --git a/tests/test_utilities.py b/tests/test_utilities.py
index 364a64b..5b469e5 100644
--- a/tests/test_utilities.py
+++ b/tests/test_utilities.py
@@ -50,16 +50,7 @@
 
 import numpy as np
 
-from dwave.plugins.sklearn.utilities import (
-    corrcoef, cov, dot_2d,
-    _compute_mi,
-    _compute_cmip_c, _compute_cmip_d,
-    _compute_cmip_cdc, _compute_cmip_ccd
-)
-from scipy.special import digamma
-from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
-from sklearn.neighbors import KDTree
-from sklearn.preprocessing import scale
+from dwave.plugins.sklearn.utilities import corrcoef, cov, dot_2d
 
 
 class TestCorrCoef(unittest.TestCase):
@@ -186,186 +177,3 @@ def test_memmap(self):
 
             out = np.memmap(fout, "float64", mode="w+", shape=(X.shape[0], X.shape[0]))
 
             dot_2d(X, X.T, out=out)
-
-
-class TestMI(unittest.TestCase):
-    def test_cmi_mixed(self):
-        np.random.seed(42)
-        n_samples, n_neighbors = 4003, 4
-        c1 = np.random.randn(n_samples,)
-        n_ss = 1001
-        d1 = np.hstack((
-            np.random.randint(-2, 1, (n_ss, )),
-            np.random.randint(10, 14, (n_samples-n_ss, ))))
-        d2 = np.random.randint(0, 4, (n_samples, ))
-        np.random.shuffle(d2)
-        cmi_ij_pair = _compute_cmip_c(c1, d1, d2, n_neighbors=n_neighbors)
-        cmi_ij_pair_cdc = _compute_cmip_cdc(c1, d1, d2, n_neighbors=n_neighbors)
-        cmi_ij_pair_ccd = _compute_cmip_ccd(c1, d1, d2, n_neighbors=n_neighbors)
-        self.assertAlmostEqual(
-            sum(cmi_ij_pair), sum(cmi_ij_pair_cdc), places=4,
-            msg="Computation for mixed continuous/discrete conditional mutual information is not accurate")
-        self.assertAlmostEqual(
-            sum(cmi_ij_pair), sum(cmi_ij_pair_ccd), places=4,
-            msg="Computation for mixed continuous/discrete conditional mutual information is not accurate")
-
-    def test_cmi_symmetry(self):
-        np.random.seed(42)
-        n_samples, n_neighbors = 4003, 4
-        # Test continuous implementation
-        Xy = np.random.randn(n_samples, 3)
-        cmi_ij_pair = _compute_cmip_c(Xy[:, 0], Xy[:, 1], Xy[:, 2], n_neighbors=n_neighbors)
-        cmi_ji_pair = _compute_cmip_c(Xy[:, 1], Xy[:, 0], Xy[:, 2], n_neighbors=n_neighbors)
-        self.assertAlmostEqual(
-            sum(cmi_ij_pair), sum(cmi_ji_pair), places=3,
-            msg="Computation for continuous conditional mutual information is not symmetric")
-
-        c1 = np.random.randn(n_samples,)
-        c2 = np.random.randn(n_samples,)
-        n_ss = 1001
-        d = np.hstack((
-
np.random.randint(-2, 1, (n_ss, )), - np.random.randint(10, 14, (n_samples-n_ss, )))) - np.random.shuffle(d) - cmi_ij_pair = _compute_cmip_ccd(c1, c2, d, n_neighbors=n_neighbors) - cmi_ji_pair = _compute_cmip_ccd(c2, c1, d, n_neighbors=n_neighbors) - self.assertAlmostEqual( - sum(cmi_ij_pair), sum(cmi_ji_pair), places=3, - msg="Computation for mixed continuous/discrete conditional mutual information is not symmetric") - - def test_cmi_discrete(self): - """ - We test the algorithm using the formula `I(x;y|z) = I(x;y) + I(z;y) - I(z;y|x)`. - Since the mutual information data is an sklearn function, - :func:`sklearn.feature_selection._mutual_info.mutual_info_classif` - it is highly likely that formula is valid only if our algorithm is correct. - """ - np.random.seed(42) - # Test discrete implementation - n_samples, n_ss, n_neighbors = 103, 51, 4 - xi = np.random.randint(0, 11, (n_samples, )) - xj = np.hstack(( - np.random.randint(-2, 1, (n_ss, )), - np.random.randint(10, 14, (n_samples-n_ss, )))) - np.random.shuffle(xi) - np.random.shuffle(xj) - y = np.random.randint(-12, -10, (n_samples, )) - cmi_ij, cmi_ji = _compute_cmip_d(xi, xj, y) - mi_i = mutual_info_classif( - xi.reshape(-1, 1), y, discrete_features=True, n_neighbors=n_neighbors) - mi_j = mutual_info_classif( - xj.reshape(-1, 1), y, discrete_features=True, n_neighbors=n_neighbors) - self.assertTrue( - np.allclose(cmi_ij + mi_j - mi_i, cmi_ji, atol=1e-5), - msg="The chain rule for discrete conditional mutual information is violated") - cmi_ij2 = _compute_cmip_d(xj, xi, y) - self.assertTrue( - np.allclose(sum(cmi_ij2), cmi_ji+cmi_ij, atol=1e-5), - msg="Discrete conditional mutual information computation is not symmetric") - - def test_mi(self): - from sklearn.feature_selection._mutual_info import _compute_mi_cd, _compute_mi_cc - from sklearn.metrics.cluster import mutual_info_score - # Test discrete implementation - n_samples, n_ss, n_neighbors, random_state = 103, 51, 4, 15 - np.random.seed(random_state) - xi = np.random.randint(0, 11, (n_samples, )) - xj = np.hstack(( - np.random.randint(-2, 1, (n_ss, )), - np.random.randint(10, 14, (n_samples-n_ss, )))) - np.random.shuffle(xi) - np.random.shuffle(xj) - xi_c = scale(np.random.randn(n_samples), with_mean=False) - xj_c = scale(np.random.randn(n_samples), with_mean=False) - - mi_ij = _compute_mi( - xi, xj, True, True, n_neighbors=n_neighbors) - mi_ij_cl = mutual_info_score(xi, xj) - self.assertTrue( - np.allclose(mi_ij_cl, mi_ij), - msg="Discrete mutual information is not computed correctly") - - mi_ij = _compute_mi( - xi_c, xj, False, True, n_neighbors=n_neighbors) - mi_ij_cl = _compute_mi_cd(xi_c, xj, n_neighbors=n_neighbors) - self.assertTrue( - np.allclose(mi_ij_cl, mi_ij, atol=1e-5), - msg=f"Error in continuous features/discrete target MI estimation is larger than expected {abs(mi_ij_cl-mi_ij)}") - - mi_ij = _compute_mi(xi_c, xj_c, False, False, n_neighbors=n_neighbors) - mi_ij_cl = _compute_mi_cc(xi_c, xj_c, n_neighbors=n_neighbors) - self.assertTrue( - np.allclose(mi_ij_cl, mi_ij, atol=1e-5), - msg=f"Error in purely continuous MI is larger than expected {abs(mi_ij_cl-mi_ij)}") - - def test_cmi_continuous(self): - # methods from https://github.com/jannisteunissen/mutual_information - def _compute_cmi_t(x, z, y, n_neighbors): - n_samples = len(x) - - x = x.reshape(-1, 1) - y = y.reshape(-1, 1) - z = z.reshape(-1, 1) - xyz = np.hstack((x, y, z)) - k = np.full(n_samples, n_neighbors) - radius = get_radius_kneighbors(xyz, n_neighbors) - - mask = (radius == 0) - if mask.sum() > 0: - 
vals, ix, counts = np.unique(xyz[mask], axis=0, - return_inverse=True, - return_counts=True) - k[mask] = counts[ix] - 1 - - nxz = num_points_within_radius(np.hstack((x, z)), radius) - nyz = num_points_within_radius(np.hstack((y, z)), radius) - nz = num_points_within_radius(z, radius) - - cmi = max(0, np.mean(digamma(k)) - np.mean(digamma(nxz + 1)) - - np.mean(digamma(nyz + 1)) + np.mean(digamma(nz + 1))) - return cmi - - def get_radius_kneighbors(x, n_neighbors): - """Determine smallest radius around x containing n_neighbors neighbors - - :param x: ndarray, shape (n_samples, n_dim) - :param n_neighbors: number of neighbors - :returns: radius, shape (n_samples,) - """ - # Use KDTree for simplicity (sometimes a ball tree could be faster) - kd = KDTree(x, metric="chebyshev") - - # Results include point itself, therefore n_neighbors+1 - neigh_dist = kd.query(x, k=n_neighbors+1)[0] - - # Take radius slightly larger than distance to last neighbor - radius = np.nextafter(neigh_dist[:, -1], 0) - return radius - - def num_points_within_radius(x, radius): - """For each point, determine the number of other points within a given radius - - :param x: ndarray, shape (n_samples, n_dim) - :param radius: radius, shape (n_samples,) - :returns: number of points within radius - """ - kd = KDTree(x, metric="chebyshev") - nx = kd.query_radius(x, radius, count_only=True, return_distance=False) - return np.array(nx) - 1.0 - - np.random.seed(42) - n_samples, n_neighbors = 103, 4 - # Test continuous implementation - Xy = np.random.randn(n_samples, 3) - cmi_t_01 = _compute_cmi_t(Xy[:, 0], Xy[:, 1], Xy[:, 2], n_neighbors=n_neighbors) - cmi_t_10 = _compute_cmi_t(Xy[:, 1], Xy[:, 0], Xy[:, 2], n_neighbors=n_neighbors) - - cmi_ij, cmi_ji = _compute_cmip_c(Xy[:, 0], Xy[:, 1], Xy[:, 2], n_neighbors=n_neighbors) - - self.assertAlmostEqual( - max(cmi_t_01, 0), cmi_ij, places=5, - msg="The algorithm doesn't match the original implementation") - - self.assertAlmostEqual( - max(cmi_t_10, 0), cmi_ji, places=5, - msg="The algorithm doesn't match the original implementation") From 4d29725e284f4c35eb7c6d0f82af7e3591ce43cf Mon Sep 17 00:00:00 2001 From: Aivar Sootla Date: Wed, 24 Apr 2024 11:03:13 +0100 Subject: [PATCH 10/12] import bug fix --- dwave/plugins/sklearn/_conditional_mutual_info.py | 1 + dwave/plugins/sklearn/transformers.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/dwave/plugins/sklearn/_conditional_mutual_info.py b/dwave/plugins/sklearn/_conditional_mutual_info.py index 9e5a546..36eec3c 100644 --- a/dwave/plugins/sklearn/_conditional_mutual_info.py +++ b/dwave/plugins/sklearn/_conditional_mutual_info.py @@ -55,6 +55,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from itertools import combinations import typing import numpy as np diff --git a/dwave/plugins/sklearn/transformers.py b/dwave/plugins/sklearn/transformers.py index db444ee..aa20d06 100644 --- a/dwave/plugins/sklearn/transformers.py +++ b/dwave/plugins/sklearn/transformers.py @@ -14,7 +14,6 @@ from __future__ import annotations -from itertools import combinations import tempfile import typing From 9560f1dea31e1b709793e71de435ba7493c5ff34 Mon Sep 17 00:00:00 2001 From: Aivar Sootla Date: Wed, 24 Apr 2024 15:31:09 +0100 Subject: [PATCH 11/12] making mi cqm consistent with the correlation based algorithm --- dwave/plugins/sklearn/transformers.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/dwave/plugins/sklearn/transformers.py b/dwave/plugins/sklearn/transformers.py index aa20d06..91795ac 100644 --- a/dwave/plugins/sklearn/transformers.py +++ b/dwave/plugins/sklearn/transformers.py @@ -239,13 +239,11 @@ def mutual_information_cqm( Class labels formatted as a numerical 1D array-like. alpha: Hyperparameter between 0 and 1 that controls the relative weight of - the relevance and redundancy terms. - ``alpha=0`` places no weight on the quality of the features, - therefore the features will be selected as to minimize the - redundancy without any consideration to quality. - ``alpha=1`` places the maximum weight on the quality of the features, + the relevance and redundancy terms. Active if conditional=False + ``alpha=0`` places the maximum weight on the feature redundancy. + ``alpha=1`` places no weight on the feature redudancy, and therefore will be equivalent to using - :class:`sklearn.feature_selection.SelectKBest`. + :class:`sklearn.feature_selection.SelectKBest` the with mutual information metric. num_features: The number of features to select. strict: @@ -312,8 +310,8 @@ def mutual_information_cqm( # method from [1]_. # mutpliypling off-diagonal ones with -1 diagonal = -np.diag(mi).copy() - # mutpliypling off-diagonal ones with alpha - np.multiply(mi, alpha, out=mi) + # mutpliypling off-diagonal ones with 1-alpha + np.multiply(mi, (1-alpha), out=mi) np.fill_diagonal(mi, diagonal) it = np.nditer(mi, flags=['multi_index'], op_flags=[['readonly']]) From 85aa2fba1551a740d6ce8acb37834fdffca11988 Mon Sep 17 00:00:00 2001 From: Aivar Sootla Date: Fri, 26 Apr 2024 12:06:47 +0000 Subject: [PATCH 12/12] add alpha tuning dial to CMI method --- dwave/plugins/sklearn/transformers.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/dwave/plugins/sklearn/transformers.py b/dwave/plugins/sklearn/transformers.py index 91795ac..1cb8f50 100644 --- a/dwave/plugins/sklearn/transformers.py +++ b/dwave/plugins/sklearn/transformers.py @@ -227,7 +227,7 @@ def mutual_information_cqm( If ``conditional`` is True then the conditional mutual information criterion from [2] is used, and if ``conditional`` is False then - mutual information based criterion from [1] is used. + mutual information based criteria from [1] are used. For computation of mutual information and conditional mutual information @@ -239,11 +239,14 @@ def mutual_information_cqm( Class labels formatted as a numerical 1D array-like. alpha: Hyperparameter between 0 and 1 that controls the relative weight of - the relevance and redundancy terms. Active if conditional=False + the relevance and redundancy terms. ``alpha=0`` places the maximum weight on the feature redundancy. 
``alpha=1`` places no weight on the feature redundancy, and
+            therefore will be equivalent to using
-            :class:`sklearn.feature_selection.SelectKBest` the with mutual information metric.
+            :class:`sklearn.feature_selection.SelectKBest`.
+            If conditional=True, ``alpha = 0`` is the default value in [2]_.
+            If conditional=False, ``alpha = (num_features - 1) / num_features``
+            is the default value in [1]_.
         num_features:
             The number of features to select.
         strict:
@@ -303,16 +306,17 @@
             random_state=random_state,
             n_jobs=n_jobs,
             conditional=conditional)
+        diagonal = -np.diag(mi).copy()
         if conditional:
-            # method from [2]_.
-            np.multiply(mi, -1, out=mi)
+            # method from [2]_ with an additional alpha tuning dial.
+            # multiplying off-diagonal terms with -(1-alpha)
+            np.multiply(mi, -(1-alpha), out=mi)
         else:
             # method from [1]_.
-            # mutpliypling off-diagonal ones with -1
-            diagonal = -np.diag(mi).copy()
             # multiplying off-diagonal terms with 1-alpha
-            np.multiply(mi, (1-alpha), out=mi)
-            np.fill_diagonal(mi, diagonal)
+            np.multiply(mi, 1-alpha, out=mi)
+        # keep the relevance terms on the diagonal
+        np.fill_diagonal(mi, diagonal)
 
         it = np.nditer(mi, flags=['multi_index'], op_flags=[['readonly']])
         cqm.set_objective((*it.multi_index, x) for x in it if x)
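
For reference, the weighting that the final two patches apply before calling cqm.set_objective can be sketched with a few lines of standalone NumPy. This is only an illustration, not the plugin's code: the 3x3 mi matrix and the objective_coefficients helper below are invented for the example, under the convention (taken from the diff) that mi[i, i] holds the relevance of feature i to the target and mi[i, j] holds the redundancy between features i and j.

import numpy as np

# Hypothetical 3-feature example (values made up): diagonal = relevance to the
# target, off-diagonal = redundancy between features.
mi = np.array([
    [0.90, 0.30, 0.10],
    [0.30, 0.70, 0.20],
    [0.10, 0.20, 0.50],
])


def objective_coefficients(mi, alpha, conditional=False):
    """Mirror the (1 - alpha) weighting sketched in the patched mutual_information_cqm."""
    mi = np.array(mi, dtype=float)           # work on a copy
    diagonal = -np.diag(mi).copy()           # relevance terms enter with a minus sign
    if conditional:
        np.multiply(mi, -(1 - alpha), out=mi)   # CMI-based redundancy terms
    else:
        np.multiply(mi, 1 - alpha, out=mi)      # MI-based redundancy terms
    np.fill_diagonal(mi, diagonal)           # keep the relevance terms on the diagonal
    return mi


for alpha in (0.0, 0.5, 1.0):
    print(alpha)
    print(np.round(objective_coefficients(mi, alpha), 3))

With alpha=1 the off-diagonal terms vanish and only the negative relevance terms remain (a pure relevance ranking); with alpha=0 the redundancy terms receive full weight, consistent with the behaviour described in the updated docstring.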