From 39ef17984df3989a9c6dc68a5deabf9384adece0 Mon Sep 17 00:00:00 2001 From: lbluque Date: Mon, 9 May 2022 05:38:38 -0700 Subject: [PATCH 01/11] rename MixedL0 --- sparselm/model/mixedL0.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sparselm/model/mixedL0.py b/sparselm/model/mixedL0.py index 09d6c9a..7a93af8 100644 --- a/sparselm/model/mixedL0.py +++ b/sparselm/model/mixedL0.py @@ -18,7 +18,7 @@ from sparselm.model.base import CVXEstimator -class mixedL0(CVXEstimator, metaclass=ABCMeta): +class MixedL0(CVXEstimator, metaclass=ABCMeta): """Abstract base class for mixed L0 regularization models: L1L0 and L2L0. """ def __init__(self, alpha=1.0, l0_ratio=0.5, big_M=1000, hierarchy=None, @@ -148,7 +148,7 @@ def _gen_hierarchy_constraints(self): for sub_id in sub_ids] -class L1L0(mixedL0): +class L1L0(MixedL0): """ Estimator with L1L0 regularization solved with mixed integer programming as discussed in: @@ -241,7 +241,7 @@ def _gen_objective(self, X, y): return objective -class L2L0(mixedL0): +class L2L0(MixedL0): """ Estimator with L1L0 regularization solved with mixed integer programming proposed by Peichen Zhong. From 6c10323a0fc6f0e62077d3472e6c1c67ad206449 Mon Sep 17 00:00:00 2001 From: lbluque Date: Fri, 13 May 2022 11:33:47 -0700 Subject: [PATCH 02/11] refactor miqp models --- sparselm/model/__init__.py | 26 +++-- sparselm/model/miqp/__init__.py | 0 sparselm/model/{ => miqp}/best_subset.py | 3 +- .../{mixedL0.py => miqp/regularized_l0.py} | 103 ++++++++++++++++++ 4 files changed, 119 insertions(+), 13 deletions(-) create mode 100644 sparselm/model/miqp/__init__.py rename sparselm/model/{ => miqp}/best_subset.py (97%) rename sparselm/model/{mixedL0.py => miqp/regularized_l0.py} (69%) diff --git a/sparselm/model/__init__.py b/sparselm/model/__init__.py index 00b4557..8e6a22d 100644 --- a/sparselm/model/__init__.py +++ b/sparselm/model/__init__.py @@ -2,21 +2,23 @@ from sparselm.model.ols import OrdinaryLeastSquares from sparselm.model.lasso import Lasso -from sparselm.model.mixedL0 import L1L0, L2L0 +from sparselm.model.miqp.best_subset import BestSubsetSelection +from sparselm.model.miqp.regularized_l0 import L1L0, L2L0 from sparselm.model.lasso import Lasso, GroupLasso, OverlapGroupLasso, SparseGroupLasso from sparselm.model.adaptive_lasso import AdaptiveLasso, AdaptiveGroupLasso, \ AdaptiveOverlapGroupLasso, AdaptiveSparseGroupLasso __all__ = [ - 'OrdinaryLeastSquares', - 'Lasso', - 'L1L0', - 'L2L0', - 'GroupLasso', - 'OverlapGroupLasso', - 'SparseGroupLasso', - 'AdaptiveLasso', - 'AdaptiveGroupLasso', - 'AdaptiveOverlapGroupLasso', - 'AdaptiveSparseGroupLasso', + "OrdinaryLeastSquares", + "Lasso", + "BestSubsetSelection", + "L1L0", + "L2L0", + "GroupLasso", + "OverlapGroupLasso", + "SparseGroupLasso", + "AdaptiveLasso", + "AdaptiveGroupLasso", + "AdaptiveOverlapGroupLasso", + "AdaptiveSparseGroupLasso", ] diff --git a/sparselm/model/miqp/__init__.py b/sparselm/model/miqp/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sparselm/model/best_subset.py b/sparselm/model/miqp/best_subset.py similarity index 97% rename from sparselm/model/best_subset.py rename to sparselm/model/miqp/best_subset.py index 882c235..61322cf 100644 --- a/sparselm/model/best_subset.py +++ b/sparselm/model/miqp/best_subset.py @@ -8,9 +8,10 @@ import cvxpy as cp from cvxpy.atoms.affine.wraps import psd_wrap -from .base import CVXEstimator +from sparselm.model.base import CVXEstimator +# TODO implement with l1 and l2 regularization grouped with less slack vars and 
equality constraints class BestSubsetSelection(CVXEstimator): """MIQP Best Subset Selection estimator diff --git a/sparselm/model/mixedL0.py b/sparselm/model/miqp/regularized_l0.py similarity index 69% rename from sparselm/model/mixedL0.py rename to sparselm/model/miqp/regularized_l0.py index 7a93af8..550a330 100644 --- a/sparselm/model/mixedL0.py +++ b/sparselm/model/miqp/regularized_l0.py @@ -13,11 +13,13 @@ import warnings from abc import ABCMeta +import numpy as np import cvxpy as cp from cvxpy.atoms.affine.wraps import psd_wrap from sparselm.model.base import CVXEstimator +# TODO make RegularizedL0 as a base class and then derive class MixedL0(CVXEstimator, metaclass=ABCMeta): """Abstract base class for mixed L0 regularization models: L1L0 and L2L0. """ @@ -253,7 +255,108 @@ class L2L0(MixedL0): def _gen_objective(self, X, y): """Generate the objective function used in l2l0 regression model""" + c0 = 2 * X.shape[0] # keeps hyperparameter scale independent + objective = super()._gen_objective(X, y) \ + + c0 * self._lambda1 * cp.sum_squares(self._beta) + + return objective + + +class GroupedL0(MixedL0, metaclass=ABCMeta): + """Grouped L0 norm""" + def __init__(self, groups, alpha=1.0, l0_ratio=0.5, big_M=1000, hierarchy=None, + ignore_psd_check=True, fit_intercept=False, normalize=False, + copy_X=True, warm_start=False, solver=None, **kwargs): + """ + Args: + groups (list or ndarray): + array-like of integers specifying groups. Length should be the + same as model, where each integer entry specifies the group + each parameter corresponds to. + alpha (float): + Regularization hyper-parameter. + l0_ratio (float): + Mixing parameter between l1 and l0 regularization. + big_M (float): + Upper bound on the norm of coefficients associated with each + cluster (groups of coefficients) ||Beta_c||_2 + hierarchy (list): + A list of lists of integers storing hierarchy relations between + coefficients. + Each sublist contains indices of other coefficients + on which the coefficient associated with each element of + the list depends. i.e. hierarchy = [[1, 2], [0], []] mean that + coefficient 0 depends on 1, and 2; 1 depends on 0, and 2 has no + dependence. + ignore_psd_check (bool): + Wether to ignore cvxpy's PSD checks of matrix used in quadratic + form. Default is True to avoid raising errors for poorly + conditioned matrices. But if you want to be strict set to False. + fit_intercept (bool): + Whether the intercept should be estimated or not. + If False, the data is assumed to be already centered. + normalize (bool): + This parameter is ignored when fit_intercept is set to False. + If True, the regressors X will be normalized before regression + by subtracting the mean and dividing by the l2-norm. + If you wish to standardize, please use StandardScaler before + calling fit on an estimator with normalize=False + copy_X (bool): + If True, X will be copied; else, it may be overwritten. + warm_start (bool): + When set to True, reuse the solution of the previous call to + fit as initialization, otherwise, just erase the previous + solution. + solver (str): + cvxpy backend solver to use. Supported solvers are: + ECOS, ECOS_BB, CVXOPT, SCS, GUROBI, Elemental. + GLPK and GLPK_MI (via CVXOPT GLPK interface) + **kwargs: + Kewyard arguments passed to cvxpy solve. + See docs linked above for more information. 
+ """ + super().__init__(alpha=alpha, l0_ratio=l0_ratio, big_M=big_M, + hierarchy=hierarchy, ignore_psd_check=ignore_psd_check, + fit_intercept=fit_intercept, + normalize=normalize, copy_X=copy_X, + warm_start=warm_start, solver=solver, **kwargs) + self.groups = np.asarray(groups) + self._group_masks = [self.groups == i for i in np.unique(groups)] + self._z0 = cp.Variable(len(self._group_masks), boolean=True) + + def _gen_objective(self, X, y): + """Generate the quadratic form portion of objective""" c0 = 2 * X.shape[0] # keeps hyperparameter scale independent + XTX = psd_wrap(X.T @ X) if self.ignore_psd_check else X.T @ X + objective = cp.quad_form(self._beta, XTX) - 2 * y.T @ X @ self._beta \ + + c0 * self._lambda0 * cp.sum(self._z0) + return objective + + def _gen_constraints(self, X, y): + """Generate the constraints used to solve l0 regularization""" + constraints = [] + for i, mask in enumerate(self._group_masks): + constraints += [self._big_M * self._z0[i] >= self._beta[mask], + self._big_M * self._z0[i] >= -self._beta[mask]] + + if self.hierarchy is not None: + constraints += self._gen_hierarchy_constraints() + return constraints + + +class GroupedL2L0(GroupedL0): + """ + Estimator with L1L0 regularization solved with mixed integer programming + proposed by Peichen Zhong. + + Regularized model is: + ||X * Beta - y||^2 + alpha * l0_ratio * ||Beta||_0 + + alpha * (1 - l0_ratio) * ||Beta||^2_2 + """ + + def _gen_objective(self, X, y): + """Generate the objective function used in l2l0 regression model""" + c0 = 2 * X.shape[0] # keeps hyperparameter scale independent objective = super()._gen_objective(X, y) \ + c0 * self._lambda1 * cp.sum_squares(self._beta) From 364bd8784c06415f1075a141d302cfc73fdd8251 Mon Sep 17 00:00:00 2001 From: lbluque Date: Sun, 15 May 2022 12:51:56 -0700 Subject: [PATCH 03/11] refactor mixed l0 from regularizedl0 base --- sparselm/model/__init__.py | 3 +- sparselm/model/miqp/regularized_l0.py | 128 +++++++++++++++++++------- 2 files changed, 97 insertions(+), 34 deletions(-) diff --git a/sparselm/model/__init__.py b/sparselm/model/__init__.py index 8e6a22d..d6ad531 100644 --- a/sparselm/model/__init__.py +++ b/sparselm/model/__init__.py @@ -3,7 +3,7 @@ from sparselm.model.ols import OrdinaryLeastSquares from sparselm.model.lasso import Lasso from sparselm.model.miqp.best_subset import BestSubsetSelection -from sparselm.model.miqp.regularized_l0 import L1L0, L2L0 +from sparselm.model.miqp.regularized_l0 import L1L0, L2L0, RegularizedL0 from sparselm.model.lasso import Lasso, GroupLasso, OverlapGroupLasso, SparseGroupLasso from sparselm.model.adaptive_lasso import AdaptiveLasso, AdaptiveGroupLasso, \ AdaptiveOverlapGroupLasso, AdaptiveSparseGroupLasso @@ -12,6 +12,7 @@ "OrdinaryLeastSquares", "Lasso", "BestSubsetSelection", + "RegularizedL0", "L1L0", "L2L0", "GroupLasso", diff --git a/sparselm/model/miqp/regularized_l0.py b/sparselm/model/miqp/regularized_l0.py index 550a330..ae84bcb 100644 --- a/sparselm/model/miqp/regularized_l0.py +++ b/sparselm/model/miqp/regularized_l0.py @@ -19,19 +19,16 @@ from sparselm.model.base import CVXEstimator -# TODO make RegularizedL0 as a base class and then derive -class MixedL0(CVXEstimator, metaclass=ABCMeta): - """Abstract base class for mixed L0 regularization models: L1L0 and L2L0. +class RegularizedL0(CVXEstimator): + """Implementation of l0 regularized estimator. 
""" - def __init__(self, alpha=1.0, l0_ratio=0.5, big_M=1000, hierarchy=None, - ignore_psd_check=True, fit_intercept=False, normalize=False, - copy_X=True, warm_start=False, solver=None, **kwargs): + def __init__(self, alpha=1.0, big_M=1000, hierarchy=None, ignore_psd_check=True, + fit_intercept=False, normalize=False, copy_X=True, warm_start=False, + solver=None, **kwargs): """ Args: alpha (float): Regularization hyper-parameter. - l0_ratio (float): - Mixing parameter between l1 and l0 regularization. big_M (float): Upper bound on the norm of coefficients associated with each cluster (groups of coefficients) ||Beta_c||_2 @@ -74,20 +71,10 @@ def __init__(self, alpha=1.0, l0_ratio=0.5, big_M=1000, hierarchy=None, copy_X=copy_X, warm_start=warm_start, solver=solver, **kwargs) - if not 0 <= l0_ratio <= 1: - raise ValueError('l0_ratio must be between 0 and 1.') - elif l0_ratio == 0.0: - warnings.warn( - "It's more efficient to use Ridge/Lasso instead of l0_ratio=0", - UserWarning) - self.hierarchy = hierarchy self._alpha = alpha + self._lambda0 = cp.Parameter(nonneg=True, value=alpha) self._big_M = cp.Parameter(nonneg=True, value=big_M) - self._lambda0 = cp.Parameter(nonneg=True, value=l0_ratio * alpha) - self._lambda1 = cp.Parameter(nonneg=True, value=(1 - l0_ratio) * alpha) - # save exact value so sklearn clone is happy dappy - self._l0_ratio = l0_ratio self.ignore_psd_check = ignore_psd_check self._z0 = None @@ -98,8 +85,7 @@ def alpha(self): @alpha.setter def alpha(self, val): self._alpha = val - self._lambda0.value = self.l0_ratio * val - self._lambda1.value = (1 - self.l0_ratio) * val + self._lambda0.value = val @property def big_M(self): @@ -109,18 +95,6 @@ def big_M(self): def big_M(self, val): self._big_M.value = val - @property - def l0_ratio(self): - return self._l0_ratio - - @l0_ratio.setter - def l0_ratio(self, val): - if not 0 <= val <= 1: - raise ValueError('l0_ratio must be between 0 and 1.') - self._l0_ratio = val - self._lambda0.value = val * self.alpha - self._lambda1.value = (1 - val) * self.alpha - def _gen_objective(self, X, y): """Generate the quadratic form portion of objective""" # psd_wrap will ignore cvxpy PSD checks, without it errors will @@ -150,6 +124,94 @@ def _gen_hierarchy_constraints(self): for sub_id in sub_ids] +class MixedL0(RegularizedL0, metaclass=ABCMeta): + """Abstract base class for mixed L0 regularization models: L1L0 and L2L0. + """ + def __init__(self, alpha=1.0, l0_ratio=0.5, big_M=1000, hierarchy=None, + ignore_psd_check=True, fit_intercept=False, normalize=False, + copy_X=True, warm_start=False, solver=None, **kwargs): + """ + Args: + alpha (float): + Regularization hyper-parameter. + l0_ratio (float): + Mixing parameter between l1 and l0 regularization. + big_M (float): + Upper bound on the norm of coefficients associated with each + cluster (groups of coefficients) ||Beta_c||_2 + hierarchy (list): + A list of lists of integers storing hierarchy relations between + coefficients. + Each sublist contains indices of other coefficients + on which the coefficient associated with each element of + the list depends. i.e. hierarchy = [[1, 2], [0], []] mean that + coefficient 0 depends on 1, and 2; 1 depends on 0, and 2 has no + dependence. + ignore_psd_check (bool): + Wether to ignore cvxpy's PSD checks of matrix used in quadratic + form. Default is True to avoid raising errors for poorly + conditioned matrices. But if you want to be strict set to False. + fit_intercept (bool): + Whether the intercept should be estimated or not. 
+ If False, the data is assumed to be already centered. + normalize (bool): + This parameter is ignored when fit_intercept is set to False. + If True, the regressors X will be normalized before regression + by subtracting the mean and dividing by the l2-norm. + If you wish to standardize, please use StandardScaler before + calling fit on an estimator with normalize=False + copy_X (bool): + If True, X will be copied; else, it may be overwritten. + warm_start (bool): + When set to True, reuse the solution of the previous call to + fit as initialization, otherwise, just erase the previous + solution. + solver (str): + cvxpy backend solver to use. Supported solvers are: + ECOS, ECOS_BB, CVXOPT, SCS, GUROBI, Elemental. + GLPK and GLPK_MI (via CVXOPT GLPK interface) + **kwargs: + Kewyard arguments passed to cvxpy solve. + See docs linked above for more information. + """ + super().__init__( + alpha=alpha, big_M=big_M, hierarchy=hierarchy, + ignore_psd_check=ignore_psd_check, fit_intercept=fit_intercept, + normalize=normalize, copy_X=copy_X, warm_start=warm_start, solver=solver, + **kwargs + ) + + if not 0 <= l0_ratio <= 1: + raise ValueError('l0_ratio must be between 0 and 1.') + elif l0_ratio == 0.0: + warnings.warn( + "It's more efficient to use Ridge/Lasso instead of l0_ratio=0", + UserWarning) + + self._lambda0.value = l0_ratio * alpha + self._lambda1 = cp.Parameter(nonneg=True, value=(1 - l0_ratio) * alpha) + # save exact value so sklearn clone is happy dappy + self._l0_ratio = l0_ratio + + @RegularizedL0.alpha.setter + def alpha(self, val): + self._alpha = val + self._lambda0.value = self.l0_ratio * val + self._lambda1.value = (1 - self.l0_ratio) * val + + @property + def l0_ratio(self): + return self._l0_ratio + + @l0_ratio.setter + def l0_ratio(self, val): + if not 0 <= val <= 1: + raise ValueError('l0_ratio must be between 0 and 1.') + self._l0_ratio = val + self._lambda0.value = val * self.alpha + self._lambda1.value = (1 - val) * self.alpha + + class L1L0(MixedL0): """ Estimator with L1L0 regularization solved with mixed integer programming From ef5e9fccf1ac61a84ab1ac69b848d389bb569207 Mon Sep 17 00:00:00 2001 From: lbluque Date: Sun, 15 May 2022 17:17:36 -0700 Subject: [PATCH 04/11] regularized grouped l0 and l2l0 --- sparselm/model/miqp/regularized_l0.py | 38 +++++++++++++++------------ 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/sparselm/model/miqp/regularized_l0.py b/sparselm/model/miqp/regularized_l0.py index ae84bcb..0e70438 100644 --- a/sparselm/model/miqp/regularized_l0.py +++ b/sparselm/model/miqp/regularized_l0.py @@ -307,7 +307,7 @@ def _gen_objective(self, X, y): class L2L0(MixedL0): """ - Estimator with L1L0 regularization solved with mixed integer programming + Estimator with L2L0 regularization solved with mixed integer programming proposed by Peichen Zhong. Regularized model is: @@ -324,9 +324,9 @@ def _gen_objective(self, X, y): return objective -class GroupedL0(MixedL0, metaclass=ABCMeta): +class GroupedL0(RegularizedL0, metaclass=ABCMeta): """Grouped L0 norm""" - def __init__(self, groups, alpha=1.0, l0_ratio=0.5, big_M=1000, hierarchy=None, + def __init__(self, groups, alpha=1.0, big_M=1000, hierarchy=None, ignore_psd_check=True, fit_intercept=False, normalize=False, copy_X=True, warm_start=False, solver=None, **kwargs): """ @@ -337,8 +337,6 @@ def __init__(self, groups, alpha=1.0, l0_ratio=0.5, big_M=1000, hierarchy=None, each parameter corresponds to. alpha (float): Regularization hyper-parameter. 
- l0_ratio (float): - Mixing parameter between l1 and l0 regularization. big_M (float): Upper bound on the norm of coefficients associated with each cluster (groups of coefficients) ||Beta_c||_2 @@ -377,11 +375,12 @@ def __init__(self, groups, alpha=1.0, l0_ratio=0.5, big_M=1000, hierarchy=None, Kewyard arguments passed to cvxpy solve. See docs linked above for more information. """ - super().__init__(alpha=alpha, l0_ratio=l0_ratio, big_M=big_M, - hierarchy=hierarchy, ignore_psd_check=ignore_psd_check, - fit_intercept=fit_intercept, - normalize=normalize, copy_X=copy_X, - warm_start=warm_start, solver=solver, **kwargs) + super().__init__( + alpha=alpha, big_M=big_M, hierarchy=hierarchy, + ignore_psd_check=ignore_psd_check, fit_intercept=fit_intercept, + normalize=normalize, copy_X=copy_X, warm_start=warm_start, solver=solver, + **kwargs) + self.groups = np.asarray(groups) self._group_masks = [self.groups == i for i in np.unique(groups)] self._z0 = cp.Variable(len(self._group_masks), boolean=True) @@ -406,16 +405,21 @@ def _gen_constraints(self, X, y): return constraints -class GroupedL2L0(GroupedL0): +class GroupedL2L0(MixedL0, GroupedL0): """ - Estimator with L1L0 regularization solved with mixed integer programming - proposed by Peichen Zhong. - - Regularized model is: - ||X * Beta - y||^2 + alpha * l0_ratio * ||Beta||_0 - + alpha * (1 - l0_ratio) * ||Beta||^2_2 + Estimator with grouped L2L0 regularization solved with mixed integer programming """ + def __init__(self, groups, alpha=1.0, l0_ratio=0.5, big_M=1000, hierarchy=None, + ignore_psd_check=True, fit_intercept=False, normalize=False, + copy_X=True, warm_start=False, solver=None, **kwargs): + # need to call super for sklearn clone function + super().__init__(groups=groups, alpha=alpha, l0_ratio=l0_ratio, big_M=big_M, + hierarchy=hierarchy, ignore_psd_check=ignore_psd_check, + fit_intercept=fit_intercept, + normalize=normalize, copy_X=copy_X, + warm_start=warm_start, solver=solver, **kwargs) + def _gen_objective(self, X, y): """Generate the objective function used in l2l0 regression model""" c0 = 2 * X.shape[0] # keeps hyperparameter scale independent From 616ea5a3949ef084d40191e3b89ec641b371a450 Mon Sep 17 00:00:00 2001 From: lbluque Date: Sun, 15 May 2022 17:20:38 -0700 Subject: [PATCH 05/11] add top level imports for grouped l0 --- sparselm/model/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sparselm/model/__init__.py b/sparselm/model/__init__.py index d6ad531..c7f7d4e 100644 --- a/sparselm/model/__init__.py +++ b/sparselm/model/__init__.py @@ -3,7 +3,8 @@ from sparselm.model.ols import OrdinaryLeastSquares from sparselm.model.lasso import Lasso from sparselm.model.miqp.best_subset import BestSubsetSelection -from sparselm.model.miqp.regularized_l0 import L1L0, L2L0, RegularizedL0 +from sparselm.model.miqp.regularized_l0 import L1L0, L2L0, RegularizedL0, GroupedL0, \ + GroupedL2L0 from sparselm.model.lasso import Lasso, GroupLasso, OverlapGroupLasso, SparseGroupLasso from sparselm.model.adaptive_lasso import AdaptiveLasso, AdaptiveGroupLasso, \ AdaptiveOverlapGroupLasso, AdaptiveSparseGroupLasso @@ -15,6 +16,8 @@ "RegularizedL0", "L1L0", "L2L0", + "GroupedL0", + "GroupedL2L0", "GroupLasso", "OverlapGroupLasso", "SparseGroupLasso", From 5a0b36e11f9761d6a3ec094206ba4c3863f5702d Mon Sep 17 00:00:00 2001 From: lbluque Date: Mon, 16 May 2022 17:50:59 -0700 Subject: [PATCH 06/11] grouped best subset --- sparselm/model/miqp/best_subset.py | 78 +++++++++++++++++++++++++++++- 1 file changed, 76 
insertions(+), 2 deletions(-) diff --git a/sparselm/model/miqp/best_subset.py b/sparselm/model/miqp/best_subset.py index 61322cf..98c3736 100644 --- a/sparselm/model/miqp/best_subset.py +++ b/sparselm/model/miqp/best_subset.py @@ -5,13 +5,12 @@ __author__ = "Luis Barroso-Luque" - +import numpy as np import cvxpy as cp from cvxpy.atoms.affine.wraps import psd_wrap from sparselm.model.base import CVXEstimator -# TODO implement with l1 and l2 regularization grouped with less slack vars and equality constraints class BestSubsetSelection(CVXEstimator): """MIQP Best Subset Selection estimator @@ -120,3 +119,78 @@ def _gen_hierarchy_constraints(self): return [self._z0[high_id] <= self._z0[sub_id] for high_id, sub_ids in enumerate(self.hierarchy) for sub_id in sub_ids] + + +class BestGroupedSubsetSelection(BestSubsetSelection): + + def __init__(self, groups, sparse_bound, big_M=1000, hierarchy=None, + ignore_psd_check=True, fit_intercept=False, normalize=False, + copy_X=True, warm_start=False, solver=None, **kwargs): + """ + + Args: + groups (list or ndarray): + array-like of integers specifying groups. Length should be the + same as model, where each integer entry specifies the group + each parameter corresponds to. + sparse_bound (int): + Upper bound on sparsity. The upper bound on total number of + nonzero coefficients. + big_M (float): + Upper bound on the norm of coefficients associated with each + cluster (groups of coefficients) ||Beta_c||_2 + hierarchy (list): + A list of lists of integers storing hierarchy relations between + coefficients. + Each sublist contains indices of other coefficients + on which the coefficient associated with each element of + the list depends. i.e. hierarchy = [[1, 2], [0], []] mean that + coefficient 0 depends on 1, and 2; 1 depends on 0, and 2 has no + dependence. + ignore_psd_check (bool): + Wether to ignore cvxpy's PSD checks of matrix used in quadratic + form. Default is True to avoid raising errors for poorly + conditioned matrices. But if you want to be strict set to False. + fit_intercept (bool): + Whether the intercept should be estimated or not. + If False, the data is assumed to be already centered. + normalize (bool): + This parameter is ignored when fit_intercept is set to False. + If True, the regressors X will be normalized before regression + by subtracting the mean and dividing by the l2-norm. + If you wish to standardize, please use StandardScaler before + calling fit on an estimator with normalize=False + copy_X (bool): + If True, X will be copied; else, it may be overwritten. + warm_start (bool): + When set to True, reuse the solution of the previous call to + fit as initialization, otherwise, just erase the previous + solution. + solver (str): + cvxpy backend solver to use. Supported solvers are: + ECOS, ECOS_BB, CVXOPT, SCS, GUROBI, Elemental. + GLPK and GLPK_MI (via CVXOPT GLPK interface) + **kwargs: + Kewyard arguments passed to cvxpy solve. + See docs linked above for more information. 
+ """ + super().__init__( + sparse_bound=sparse_bound, big_M=big_M, hierarchy=hierarchy, + ignore_psd_check=ignore_psd_check, fit_intercept=fit_intercept, + normalize=normalize, copy_X=copy_X, warm_start=warm_start, solver=solver, + **kwargs + ) + self.groups = np.asarray(groups) + self._group_masks = [self.groups == i for i in np.unique(groups)] + self._z0 = cp.Variable(len(self._group_masks), boolean=True) + + def _gen_constraints(self, X, y): + """Generate the constraints used to solve l0 regularization""" + constraints = [] + for i, mask in enumerate(self._group_masks): + constraints += [self._big_M * self._z0[i] >= self._beta[mask], + self._big_M * self._z0[i] >= -self._beta[mask]] + + if self.hierarchy is not None: + constraints += self._gen_hierarchy_constraints() + return constraints From a6cb80ba42984b26994e4312578711c887d0d18e Mon Sep 17 00:00:00 2001 From: lbluque Date: Mon, 16 May 2022 18:06:13 -0700 Subject: [PATCH 07/11] ridged best subset --- sparselm/model/miqp/best_subset.py | 72 ++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/sparselm/model/miqp/best_subset.py b/sparselm/model/miqp/best_subset.py index 98c3736..6454488 100644 --- a/sparselm/model/miqp/best_subset.py +++ b/sparselm/model/miqp/best_subset.py @@ -121,6 +121,78 @@ def _gen_hierarchy_constraints(self): for sub_id in sub_ids] +class RidgedBestSubsetSelection(BestSubsetSelection): + + def __init__(self, sparse_bound, alpha=1.0, big_M=1000, hierarchy=None, + ignore_psd_check=True, fit_intercept=False, normalize=False, + copy_X=True, warm_start=False, solver=None, **kwargs): + """ + Args: + sparse_bound (int): + Upper bound on sparsity. The upper bound on total number of + nonzero coefficients. + big_M (float): + Upper bound on the norm of coefficients associated with each + cluster (groups of coefficients) ||Beta_c||_2 + hierarchy (list): + A list of lists of integers storing hierarchy relations between + coefficients. + Each sublist contains indices of other coefficients + on which the coefficient associated with each element of + the list depends. i.e. hierarchy = [[1, 2], [0], []] mean that + coefficient 0 depends on 1, and 2; 1 depends on 0, and 2 has no + dependence. + ignore_psd_check (bool): + Wether to ignore cvxpy's PSD checks of matrix used in quadratic + form. Default is True to avoid raising errors for poorly + conditioned matrices. But if you want to be strict set to False. + fit_intercept (bool): + Whether the intercept should be estimated or not. + If False, the data is assumed to be already centered. + normalize (bool): + This parameter is ignored when fit_intercept is set to False. + If True, the regressors X will be normalized before regression + by subtracting the mean and dividing by the l2-norm. + If you wish to standardize, please use StandardScaler before + calling fit on an estimator with normalize=False + copy_X (bool): + If True, X will be copied; else, it may be overwritten. + warm_start (bool): + When set to True, reuse the solution of the previous call to + fit as initialization, otherwise, just erase the previous + solution. + solver (str): + cvxpy backend solver to use. Supported solvers are: + ECOS, ECOS_BB, CVXOPT, SCS, GUROBI, Elemental. + GLPK and GLPK_MI (via CVXOPT GLPK interface) + **kwargs: + Kewyard arguments passed to cvxpy solve. + See docs linked above for more information. 
+ """ + super().__init__( + sparse_bound=sparse_bound, big_M=big_M, hierarchy=hierarchy, + ignore_psd_check=ignore_psd_check, fit_intercept=fit_intercept, + normalize=normalize, copy_X=copy_X, warm_start=warm_start, solver=solver, + **kwargs + ) + self._alpha = cp.Parameter(nonneg=True, value=alpha) + + @property + def alpha(self): + return self._alpha.value + + @alpha.setter + def alpha(self, val): + self._alpha.value = val + + def _gen_objective(self, X, y): + """Generate the objective function used in l2l0 regression model""" + c0 = 2 * X.shape[0] # keeps hyperparameter scale independent + objective = super()._gen_objective(X, y) \ + + c0 * self._alpha * cp.sum_squares(self._beta) + return objective + + class BestGroupedSubsetSelection(BestSubsetSelection): def __init__(self, groups, sparse_bound, big_M=1000, hierarchy=None, From b00ea704f35fe7c683b098fd4841ea9c989f685a Mon Sep 17 00:00:00 2001 From: lbluque Date: Mon, 16 May 2022 18:20:30 -0700 Subject: [PATCH 08/11] ridged best group --- sparselm/model/miqp/best_subset.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/sparselm/model/miqp/best_subset.py b/sparselm/model/miqp/best_subset.py index 6454488..358302e 100644 --- a/sparselm/model/miqp/best_subset.py +++ b/sparselm/model/miqp/best_subset.py @@ -193,7 +193,7 @@ def _gen_objective(self, X, y): return objective -class BestGroupedSubsetSelection(BestSubsetSelection): +class BestGroupSelection(BestSubsetSelection): def __init__(self, groups, sparse_bound, big_M=1000, hierarchy=None, ignore_psd_check=True, fit_intercept=False, normalize=False, @@ -266,3 +266,23 @@ def _gen_constraints(self, X, y): if self.hierarchy is not None: constraints += self._gen_hierarchy_constraints() return constraints + + +class RidgedBestGroupSelection(RidgedBestSubsetSelection, BestGroupSelection): + + def __init__(self, groups, sparse_bound, alpha=1.0, big_M=1000, hierarchy=None, + ignore_psd_check=True, fit_intercept=False, normalize=False, + copy_X=True, warm_start=False, solver=None, **kwargs): + # need to call super for sklearn clone function + super().__init__( + groups=groups, sparse_bound=sparse_bound, alpha=alpha, big_M=big_M, + hierarchy=hierarchy, ignore_psd_check=ignore_psd_check, + fit_intercept=fit_intercept, normalize=normalize, copy_X=copy_X, + warm_start=warm_start, solver=solver, **kwargs + ) + + def _gen_objective(self, X, y): + RidgedBestSubsetSelection._gen_objective(self, X, y) + + def _gen_constraints(self, X, y): + BestGroupSelection._gen_constraints(self, X, y) From c41c4f1aebda9bb697452ba8d14c2383853d08f1 Mon Sep 17 00:00:00 2001 From: lbluque Date: Mon, 16 May 2022 18:21:56 -0700 Subject: [PATCH 09/11] top level imports for best_subset estimators --- sparselm/model/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sparselm/model/__init__.py b/sparselm/model/__init__.py index c7f7d4e..6076798 100644 --- a/sparselm/model/__init__.py +++ b/sparselm/model/__init__.py @@ -2,7 +2,8 @@ from sparselm.model.ols import OrdinaryLeastSquares from sparselm.model.lasso import Lasso -from sparselm.model.miqp.best_subset import BestSubsetSelection +from sparselm.model.miqp.best_subset import BestSubsetSelection, BestGroupSelection, \ + RidgedBestSubsetSelection, RidgedBestGroupSelection from sparselm.model.miqp.regularized_l0 import L1L0, L2L0, RegularizedL0, GroupedL0, \ GroupedL2L0 from sparselm.model.lasso import Lasso, GroupLasso, OverlapGroupLasso, SparseGroupLasso @@ -13,6 +14,9 @@ "OrdinaryLeastSquares", "Lasso", 
"BestSubsetSelection", + "BestGroupSelection", + "RidgedBestSubsetSelection", + "RidgedBestGroupSelection", "RegularizedL0", "L1L0", "L2L0", From 4e922d0d50406ddb264b84f4fe14e79f51dedee5 Mon Sep 17 00:00:00 2001 From: lbluque Date: Thu, 19 May 2022 22:26:42 -0700 Subject: [PATCH 10/11] add docstrings and abstractmethod --- sparselm/model/miqp/best_subset.py | 53 ++++++++++++++++++++- sparselm/model/miqp/regularized_l0.py | 68 ++++++++++++++++++++++++--- 2 files changed, 113 insertions(+), 8 deletions(-) diff --git a/sparselm/model/miqp/best_subset.py b/sparselm/model/miqp/best_subset.py index 358302e..6e73357 100644 --- a/sparselm/model/miqp/best_subset.py +++ b/sparselm/model/miqp/best_subset.py @@ -122,6 +122,7 @@ def _gen_hierarchy_constraints(self): class RidgedBestSubsetSelection(BestSubsetSelection): + """Best subset selection estimaor with ridge regularization.""" def __init__(self, sparse_bound, alpha=1.0, big_M=1000, hierarchy=None, ignore_psd_check=True, fit_intercept=False, normalize=False, @@ -194,12 +195,12 @@ def _gen_objective(self, X, y): class BestGroupSelection(BestSubsetSelection): + """Best group selection estimator.""" def __init__(self, groups, sparse_bound, big_M=1000, hierarchy=None, ignore_psd_check=True, fit_intercept=False, normalize=False, copy_X=True, warm_start=False, solver=None, **kwargs): """ - Args: groups (list or ndarray): array-like of integers specifying groups. Length should be the @@ -269,10 +270,60 @@ def _gen_constraints(self, X, y): class RidgedBestGroupSelection(RidgedBestSubsetSelection, BestGroupSelection): + """Best group selection estimator with ridge regularization.""" def __init__(self, groups, sparse_bound, alpha=1.0, big_M=1000, hierarchy=None, ignore_psd_check=True, fit_intercept=False, normalize=False, copy_X=True, warm_start=False, solver=None, **kwargs): + """ + Args: + groups (list or ndarray): + array-like of integers specifying groups. Length should be the + same as model, where each integer entry specifies the group + each parameter corresponds to. + sparse_bound (int): + Upper bound on sparsity. The upper bound on total number of + nonzero coefficients. + alpha (float): + Ridge regularization hyper-parameter. + big_M (float): + Upper bound on the norm of coefficients associated with each + cluster (groups of coefficients) ||Beta_c||_2 + hierarchy (list): + A list of lists of integers storing hierarchy relations between + coefficients. + Each sublist contains indices of other coefficients + on which the coefficient associated with each element of + the list depends. i.e. hierarchy = [[1, 2], [0], []] mean that + coefficient 0 depends on 1, and 2; 1 depends on 0, and 2 has no + dependence. + ignore_psd_check (bool): + Wether to ignore cvxpy's PSD checks of matrix used in quadratic + form. Default is True to avoid raising errors for poorly + conditioned matrices. But if you want to be strict set to False. + fit_intercept (bool): + Whether the intercept should be estimated or not. + If False, the data is assumed to be already centered. + normalize (bool): + This parameter is ignored when fit_intercept is set to False. + If True, the regressors X will be normalized before regression + by subtracting the mean and dividing by the l2-norm. + If you wish to standardize, please use StandardScaler before + calling fit on an estimator with normalize=False + copy_X (bool): + If True, X will be copied; else, it may be overwritten. 
+            warm_start (bool):
+                When set to True, reuse the solution of the previous call to
+                fit as initialization, otherwise, just erase the previous
+                solution.
+            solver (str):
+                cvxpy backend solver to use. Supported solvers are:
+                ECOS, ECOS_BB, CVXOPT, SCS, GUROBI, Elemental.
+                GLPK and GLPK_MI (via CVXOPT GLPK interface)
+            **kwargs:
+                Keyword arguments passed to cvxpy solve.
+                See docs linked above for more information.
+            """
         # need to call super for sklearn clone function
         super().__init__(
             groups=groups, sparse_bound=sparse_bound, alpha=alpha, big_M=big_M,
diff --git a/sparselm/model/miqp/regularized_l0.py b/sparselm/model/miqp/regularized_l0.py
index 0e70438..20bf8fc 100644
--- a/sparselm/model/miqp/regularized_l0.py
+++ b/sparselm/model/miqp/regularized_l0.py
@@ -12,7 +12,7 @@

 import warnings

-from abc import ABCMeta
+from abc import ABCMeta, abstractmethod

 import numpy as np
 import cvxpy as cp
 from cvxpy.atoms.affine.wraps import psd_wrap
@@ -20,8 +20,8 @@

 class RegularizedL0(CVXEstimator):
-    """Implementation of l0 regularized estimator.
-    """
+    """Implementation of l0 regularized estimator."""
+
     def __init__(self, alpha=1.0, big_M=1000, hierarchy=None, ignore_psd_check=True,
                  fit_intercept=False, normalize=False, copy_X=True, warm_start=False,
                  solver=None, **kwargs):
@@ -125,8 +125,8 @@

 class MixedL0(RegularizedL0, metaclass=ABCMeta):
-    """Abstract base class for mixed L0 regularization models: L1L0 and L2L0.
-    """
+    """Abstract base class for mixed L0 regularization models: L1L0 and L2L0."""
+
     def __init__(self, alpha=1.0, l0_ratio=0.5, big_M=1000, hierarchy=None,
                  ignore_psd_check=True, fit_intercept=False, normalize=False,
                  copy_X=True, warm_start=False, solver=None, **kwargs):
@@ -211,6 +211,11 @@ def l0_ratio(self, val):
         self._lambda0.value = val * self.alpha
         self._lambda1.value = (1 - val) * self.alpha

+    @abstractmethod
+    def _gen_objective(self, X, y):
+        """Generate optimization objective."""
+        return
+

 class L1L0(MixedL0):
     """
@@ -324,8 +329,9 @@ def _gen_objective(self, X, y):
         return objective


-class GroupedL0(RegularizedL0, metaclass=ABCMeta):
-    """Grouped L0 norm"""
+class GroupedL0(RegularizedL0):
+    """Estimator with grouped L0 pseudo-norm regularization."""
+
     def __init__(self, groups, alpha=1.0, big_M=1000, hierarchy=None,
                  ignore_psd_check=True, fit_intercept=False, normalize=False,
                  copy_X=True, warm_start=False, solver=None, **kwargs):
@@ -413,6 +419,54 @@ class GroupedL2L0(MixedL0, GroupedL0):
     def __init__(self, groups, alpha=1.0, l0_ratio=0.5, big_M=1000, hierarchy=None,
                  ignore_psd_check=True, fit_intercept=False, normalize=False,
                  copy_X=True, warm_start=False, solver=None, **kwargs):
+        """
+        Args:
+            groups (list or ndarray):
+                array-like of integers specifying groups. Length should be the
+                same as model, where each integer entry specifies the group
+                each parameter corresponds to.
+            alpha (float):
+                Regularization hyper-parameter.
+            l0_ratio (float):
+                Mixing parameter between l1 and l0 regularization.
+            big_M (float):
+                Upper bound on the norm of coefficients associated with each
+                cluster (groups of coefficients) ||Beta_c||_2
+            hierarchy (list):
+                A list of lists of integers storing hierarchy relations between
+                coefficients.
+                Each sublist contains indices of other coefficients
+                on which the coefficient associated with each element of
+                the list depends. i.e. hierarchy = [[1, 2], [0], []] means that
+                coefficient 0 depends on 1, and 2; 1 depends on 0, and 2 has no
+                dependence.
+            ignore_psd_check (bool):
+                Whether to ignore cvxpy's PSD checks of matrix used in quadratic
+                form. Default is True to avoid raising errors for poorly
+                conditioned matrices. But if you want to be strict set to False.
+            fit_intercept (bool):
+                Whether the intercept should be estimated or not.
+                If False, the data is assumed to be already centered.
+            normalize (bool):
+                This parameter is ignored when fit_intercept is set to False.
+                If True, the regressors X will be normalized before regression
+                by subtracting the mean and dividing by the l2-norm.
+                If you wish to standardize, please use StandardScaler before
+                calling fit on an estimator with normalize=False
+            copy_X (bool):
+                If True, X will be copied; else, it may be overwritten.
+            warm_start (bool):
+                When set to True, reuse the solution of the previous call to
+                fit as initialization, otherwise, just erase the previous
+                solution.
+            solver (str):
+                cvxpy backend solver to use. Supported solvers are:
+                ECOS, ECOS_BB, CVXOPT, SCS, GUROBI, Elemental.
+                GLPK and GLPK_MI (via CVXOPT GLPK interface)
+            **kwargs:
+                Keyword arguments passed to cvxpy solve.
+                See docs linked above for more information.
+        """
         # need to call super for sklearn clone function
         super().__init__(groups=groups, alpha=alpha, l0_ratio=l0_ratio, big_M=big_M,
                          hierarchy=hierarchy, ignore_psd_check=ignore_psd_check,

From 41ea49a8428a2d10381bacf8eec6e0aba42b1ae5 Mon Sep 17 00:00:00 2001
From: lbluque
Date: Mon, 23 May 2022 11:33:25 -0700
Subject: [PATCH 11/11] update README.md and docs

---
 README.md                             | 5 ++++-
 sparselm/model/miqp/best_subset.py    | 6 +++---
 sparselm/model/miqp/regularized_l0.py | 4 +++-
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 7edea4e..0d310a3 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,10 @@ Available regression models
 - Lasso (`sklearn` may be a better option)
 - Group Lasso, Overlap Group Lasso & Sparse Group Lasso
 - Adaptive versions of Lasso, Group Lasso, Overlap Group Lasso & Sparse Group Lasso
-- Best subset selection, L1L0 & L2L0 (we recommend using `gurobi` for performance)
+- Best subset selection, ridged best subset, L0, L1L0 & L2L0
+  (`gurobi` recommended for performance)
+- Best group selection, ridged best group selection, grouped L0, grouped L2L0
+  (`gurobi` recommended for performance)

 Installation
 ------------
diff --git a/sparselm/model/miqp/best_subset.py b/sparselm/model/miqp/best_subset.py
index 6e73357..1b92b06 100644
--- a/sparselm/model/miqp/best_subset.py
+++ b/sparselm/model/miqp/best_subset.py
@@ -122,7 +122,7 @@

 class RidgedBestSubsetSelection(BestSubsetSelection):
-    """Best subset selection estimaor with ridge regularization."""
+    """MIQP Best subset selection estimator with ridge regularization."""

     def __init__(self, sparse_bound, alpha=1.0, big_M=1000, hierarchy=None,
                  ignore_psd_check=True, fit_intercept=False, normalize=False,
@@ -144,7 +144,7 @@
                 coefficient 0 depends on 1, and 2; 1 depends on 0, and 2 has no
                 dependence.
             ignore_psd_check (bool):
-                Wether to ignore cvxpy's PSD checks of matrix used in quadratic
+                Whether to ignore cvxpy's PSD checks of matrix used in quadratic
                 form. Default is True to avoid raising errors for poorly
                 conditioned matrices. But if you want to be strict set to False.
fit_intercept (bool): @@ -195,7 +195,7 @@ def _gen_objective(self, X, y): class BestGroupSelection(BestSubsetSelection): - """Best group selection estimator.""" + """MIQP Best group selection estimator.""" def __init__(self, groups, sparse_bound, big_M=1000, hierarchy=None, ignore_psd_check=True, fit_intercept=False, normalize=False, diff --git a/sparselm/model/miqp/regularized_l0.py b/sparselm/model/miqp/regularized_l0.py index 20bf8fc..b8863a5 100644 --- a/sparselm/model/miqp/regularized_l0.py +++ b/sparselm/model/miqp/regularized_l0.py @@ -20,7 +20,7 @@ class RegularizedL0(CVXEstimator): - """Implementation of l0 regularized estimator.""" + """Implementation of MIQP l0 regularized estimator.""" def __init__(self, alpha=1.0, big_M=1000, hierarchy=None, ignore_psd_check=True, fit_intercept=False, normalize=False, copy_X=True, warm_start=False, @@ -315,6 +315,8 @@ class L2L0(MixedL0): Estimator with L2L0 regularization solved with mixed integer programming proposed by Peichen Zhong. + https://arxiv.org/abs/2204.13789 + Regularized model is: ||X * Beta - y||^2 + alpha * l0_ratio * ||Beta||_0 + alpha * (1 - l0_ratio) * ||Beta||^2_2
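
For reference, a minimal usage sketch of the estimators introduced in this series. Everything below is illustrative rather than part of the patches: the toy data, group assignment, and hyperparameter values are made-up assumptions, the sklearn-style `fit`/`coef_` API is presumed from the `CVXEstimator` base class, and these MIQP problems need a mixed-integer-capable cvxpy backend such as ECOS_BB or GUROBI (both listed as supported in the docstrings).

    import numpy as np
    from sklearn.datasets import make_regression

    from sparselm.model import GroupedL2L0, RegularizedL0, RidgedBestGroupSelection

    # toy problem: 12 coefficients arranged into 4 groups of 3
    X, y = make_regression(n_samples=60, n_features=12, n_informative=4,
                           random_state=0)
    groups = np.repeat(np.arange(4), 3)

    # plain l0 regularization: ||X * Beta - y||^2 + alpha * ||Beta||_0
    l0 = RegularizedL0(alpha=1e-2, big_M=100, solver="ECOS_BB")
    l0.fit(X, y)

    # grouped mixed l2/l0: one binary indicator variable per group
    # instead of per coefficient
    gl2l0 = GroupedL2L0(groups, alpha=1e-2, l0_ratio=0.5, big_M=100,
                        solver="ECOS_BB")
    gl2l0.fit(X, y)

    # best group selection with ridge shrinkage; per the docstring,
    # sparse_bound is the upper bound on the number of nonzero coefficients
    rbgs = RidgedBestGroupSelection(groups, sparse_bound=2, alpha=1e-3,
                                    big_M=100, solver="ECOS_BB")
    rbgs.fit(X, y)

    print(l0.coef_)      # coef_ assumed from the sklearn-style base class
    print(gl2l0.coef_)
    print(rbgs.coef_)

Note that `big_M` must genuinely bound the magnitude of the true coefficients for the big-M indicator constraints to be valid; too small a value silently truncates the solution, too large a value weakens the MIQP relaxation.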