diff --git a/README.md b/README.md index 7edea4e..0d310a3 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,10 @@ Available regression models - Lasso (`sklearn` may be a better option) - Group Lasso, Overlap Group Lasso & Sparse Group Lasso - Adaptive versions of Lasso, Group Lasso, Overlap Group Lasso & Sparse Group Lasso -- Best subset selection, L1L0 & L2L0 (we recommend using `gurobi` for performance) +- Best subset selection, ridged best subset, L0, L1L0 & L2L0 + (`gurobi` recommended for performance) +- Best group selection, ridged best group selection, grouped L0, grouped L2L0 + (`gurobi` recommended for performance) Installation ------------ diff --git a/sparselm/model/__init__.py b/sparselm/model/__init__.py index 00b4557..6076798 100644 --- a/sparselm/model/__init__.py +++ b/sparselm/model/__init__.py @@ -2,21 +2,31 @@ from sparselm.model.ols import OrdinaryLeastSquares from sparselm.model.lasso import Lasso -from sparselm.model.mixedL0 import L1L0, L2L0 +from sparselm.model.miqp.best_subset import BestSubsetSelection, BestGroupSelection, \ + RidgedBestSubsetSelection, RidgedBestGroupSelection +from sparselm.model.miqp.regularized_l0 import L1L0, L2L0, RegularizedL0, GroupedL0, \ + GroupedL2L0 from sparselm.model.lasso import Lasso, GroupLasso, OverlapGroupLasso, SparseGroupLasso from sparselm.model.adaptive_lasso import AdaptiveLasso, AdaptiveGroupLasso, \ AdaptiveOverlapGroupLasso, AdaptiveSparseGroupLasso __all__ = [ - 'OrdinaryLeastSquares', - 'Lasso', - 'L1L0', - 'L2L0', - 'GroupLasso', - 'OverlapGroupLasso', - 'SparseGroupLasso', - 'AdaptiveLasso', - 'AdaptiveGroupLasso', - 'AdaptiveOverlapGroupLasso', - 'AdaptiveSparseGroupLasso', + "OrdinaryLeastSquares", + "Lasso", + "BestSubsetSelection", + "BestGroupSelection", + "RidgedBestSubsetSelection", + "RidgedBestGroupSelection", + "RegularizedL0", + "L1L0", + "L2L0", + "GroupedL0", + "GroupedL2L0", + "GroupLasso", + "OverlapGroupLasso", + "SparseGroupLasso", + "AdaptiveLasso", + "AdaptiveGroupLasso", + "AdaptiveOverlapGroupLasso", + "AdaptiveSparseGroupLasso", ] diff --git a/sparselm/model/best_subset.py b/sparselm/model/best_subset.py deleted file mode 100644 index 882c235..0000000 --- a/sparselm/model/best_subset.py +++ /dev/null @@ -1,121 +0,0 @@ -"""MIQP based solvers for Best Subset Selection solutions. - -Allows hierarchy constraints similar to mixed L0 solvers. -""" - -__author__ = "Luis Barroso-Luque" - - -import cvxpy as cp -from cvxpy.atoms.affine.wraps import psd_wrap -from .base import CVXEstimator - - -class BestSubsetSelection(CVXEstimator): - """MIQP Best Subset Selection estimator - - WARNING: Even with gurobi solver, this can take a very long time to - converge for large problems and underdetermined problems. - """ - - def __init__(self, sparse_bound, big_M=1000, hierarchy=None, - ignore_psd_check=True, fit_intercept=False, normalize=False, - copy_X=True, warm_start=False, solver=None, **kwargs): - """ - - Args: - sparse_bound (int): - Upper bound on sparsity. The upper bound on total number of - nonzero coefficients. - big_M (float): - Upper bound on the norm of coefficients associated with each - cluster (groups of coefficients) ||Beta_c||_2 - hierarchy (list): - A list of lists of integers storing hierarchy relations between - coefficients. - Each sublist contains indices of other coefficients - on which the coefficient associated with each element of - the list depends. i.e. hierarchy = [[1, 2], [0], []] mean that - coefficient 0 depends on 1, and 2; 1 depends on 0, and 2 has no - dependence. 
- ignore_psd_check (bool): - Wether to ignore cvxpy's PSD checks of matrix used in quadratic - form. Default is True to avoid raising errors for poorly - conditioned matrices. But if you want to be strict set to False. - fit_intercept (bool): - Whether the intercept should be estimated or not. - If False, the data is assumed to be already centered. - normalize (bool): - This parameter is ignored when fit_intercept is set to False. - If True, the regressors X will be normalized before regression - by subtracting the mean and dividing by the l2-norm. - If you wish to standardize, please use StandardScaler before - calling fit on an estimator with normalize=False - copy_X (bool): - If True, X will be copied; else, it may be overwritten. - warm_start (bool): - When set to True, reuse the solution of the previous call to - fit as initialization, otherwise, just erase the previous - solution. - solver (str): - cvxpy backend solver to use. Supported solvers are: - ECOS, ECOS_BB, CVXOPT, SCS, GUROBI, Elemental. - GLPK and GLPK_MI (via CVXOPT GLPK interface) - **kwargs: - Kewyard arguments passed to cvxpy solve. - See docs linked above for more information. - """ - super().__init__(fit_intercept=fit_intercept, normalize=normalize, - copy_X=copy_X, warm_start=warm_start, solver=solver, - **kwargs) - - self._bound = cp.Parameter(nonneg=True, value=sparse_bound) - self.hierarchy = hierarchy - self._big_M = cp.Parameter(nonneg=True, value=big_M) - self.ignore_psd_check = ignore_psd_check - self._z0 = None - - @property - def sparse_bound(self): - return self._bound.value - - @sparse_bound.setter - def sparse_bound(self, val): - if val <= 0: - raise ValueError(f"sparse_bound must be > 0") - self._bound.value = val - - @property - def big_M(self): - return self._big_M.value - - @big_M.setter - def big_M(self, val): - self._big_M.value = val - - def _gen_objective(self, X, y): - """Generate the quadratic form portion of objective""" - # psd_wrap will ignore cvxpy PSD checks, without it errors will - # likely be raised since correlation matrices are usually very - # poorly conditioned - XTX = psd_wrap(X.T @ X) if self.ignore_psd_check else X.T @ X - objective = cp.quad_form(self._beta, XTX) - 2 * y.T @ X @ self._beta - # objective = cp.sum_squares(X @ self._beta - y) - return objective - - def _gen_constraints(self, X, y): - """Generate the constraints used to solve l0 regularization""" - self._z0 = cp.Variable(X.shape[1], boolean=True) - constraints = [self._big_M * self._z0 >= self._beta, - self._big_M * self._z0 >= -self._beta, - cp.sum(self._z0) <= self._bound] - - if self.hierarchy is not None: - constraints += self._gen_hierarchy_constraints() - return constraints - - def _gen_hierarchy_constraints(self): - """Generate single feature hierarchy constraints""" - return [self._z0[high_id] <= self._z0[sub_id] - for high_id, sub_ids in enumerate(self.hierarchy) - for sub_id in sub_ids] diff --git a/sparselm/model/miqp/__init__.py b/sparselm/model/miqp/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sparselm/model/miqp/best_subset.py b/sparselm/model/miqp/best_subset.py new file mode 100644 index 0000000..1b92b06 --- /dev/null +++ b/sparselm/model/miqp/best_subset.py @@ -0,0 +1,339 @@ +"""MIQP based solvers for Best Subset Selection solutions. + +Allows hierarchy constraints similar to mixed L0 solvers. 
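+
+A minimal usage sketch (hypothetical data; estimators follow the sklearn-style
+fit API inherited from CVXEstimator, and a MIQP-capable solver is assumed):
+
+    import numpy as np
+    from sparselm.model import BestSubsetSelection
+
+    X, y = np.random.rand(50, 10), np.random.rand(50)
+    estimator = BestSubsetSelection(sparse_bound=5, solver="ECOS_BB")
+    estimator.fit(X, y)
+    print(estimator.coef_)  # assumes the usual sklearn coef_ attribute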
+"""
+
+__author__ = "Luis Barroso-Luque"
+
+import numpy as np
+import cvxpy as cp
+from cvxpy.atoms.affine.wraps import psd_wrap
+from sparselm.model.base import CVXEstimator
+
+
+class BestSubsetSelection(CVXEstimator):
+    """MIQP Best Subset Selection estimator.
+
+    WARNING: Even with the gurobi solver, this can take a very long time to
+    converge for large and underdetermined problems.
+    """
+
+    def __init__(self, sparse_bound, big_M=1000, hierarchy=None,
+                 ignore_psd_check=True, fit_intercept=False, normalize=False,
+                 copy_X=True, warm_start=False, solver=None, **kwargs):
+        """
+        Args:
+            sparse_bound (int):
+                Upper bound on sparsity. The upper bound on total number of
+                nonzero coefficients.
+            big_M (float):
+                Upper bound on the norm of coefficients associated with each
+                cluster (groups of coefficients) ||Beta_c||_2
+            hierarchy (list):
+                A list of lists of integers storing hierarchy relations between
+                coefficients.
+                Each sublist contains indices of other coefficients
+                on which the coefficient associated with each element of
+                the list depends. e.g. hierarchy = [[1, 2], [0], []] means that
+                coefficient 0 depends on 1 and 2; 1 depends on 0; and 2 has no
+                dependence.
+            ignore_psd_check (bool):
+                Whether to ignore cvxpy's PSD checks of the matrix used in the
+                quadratic form. Default is True to avoid raising errors for
+                poorly conditioned matrices. But if you want to be strict set
+                to False.
+            fit_intercept (bool):
+                Whether the intercept should be estimated or not.
+                If False, the data is assumed to be already centered.
+            normalize (bool):
+                This parameter is ignored when fit_intercept is set to False.
+                If True, the regressors X will be normalized before regression
+                by subtracting the mean and dividing by the l2-norm.
+                If you wish to standardize, please use StandardScaler before
+                calling fit on an estimator with normalize=False.
+            copy_X (bool):
+                If True, X will be copied; else, it may be overwritten.
+            warm_start (bool):
+                When set to True, reuse the solution of the previous call to
+                fit as initialization; otherwise, just erase the previous
+                solution.
+            solver (str):
+                cvxpy backend solver to use. Supported solvers are:
+                ECOS, ECOS_BB, CVXOPT, SCS, GUROBI, Elemental,
+                GLPK and GLPK_MI (via CVXOPT GLPK interface).
+            **kwargs:
+                Keyword arguments passed to cvxpy solve.
+                See the cvxpy docs for more information.
+        """
+        super().__init__(fit_intercept=fit_intercept, normalize=normalize,
+                         copy_X=copy_X, warm_start=warm_start, solver=solver,
+                         **kwargs)
+
+        self._bound = cp.Parameter(nonneg=True, value=sparse_bound)
+        self.hierarchy = hierarchy
+        self._big_M = cp.Parameter(nonneg=True, value=big_M)
+        self.ignore_psd_check = ignore_psd_check
+        self._z0 = None
+
+    @property
+    def sparse_bound(self):
+        return self._bound.value
+
+    @sparse_bound.setter
+    def sparse_bound(self, val):
+        if val <= 0:
+            raise ValueError("sparse_bound must be > 0")
+        self._bound.value = val
+
+    @property
+    def big_M(self):
+        return self._big_M.value
+
+    @big_M.setter
+    def big_M(self, val):
+        self._big_M.value = val
+
+    def _gen_objective(self, X, y):
+        """Generate the quadratic form portion of objective"""
+        # psd_wrap will ignore cvxpy PSD checks, without it errors will
+        # likely be raised since correlation matrices are usually very
+        # poorly conditioned
+        XTX = psd_wrap(X.T @ X) if self.ignore_psd_check else X.T @ X
+        objective = cp.quad_form(self._beta, XTX) - 2 * y.T @ X @ self._beta
+        # objective = cp.sum_squares(X @ self._beta - y)
+        return objective
+
+    def _gen_constraints(self, X, y):
+        """Generate the constraints used to solve l0 regularization"""
+        self._z0 = cp.Variable(X.shape[1], boolean=True)
+        constraints = [self._big_M * self._z0 >= self._beta,
+                       self._big_M * self._z0 >= -self._beta,
+                       cp.sum(self._z0) <= self._bound]
+
+        if self.hierarchy is not None:
+            constraints += self._gen_hierarchy_constraints()
+        return constraints
+
+    def _gen_hierarchy_constraints(self):
+        """Generate single feature hierarchy constraints"""
+        return [self._z0[high_id] <= self._z0[sub_id]
+                for high_id, sub_ids in enumerate(self.hierarchy)
+                for sub_id in sub_ids]
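+
+
+# Note: the constraints above encode the indicator relationship
+# |beta_i| <= big_M * z0_i, so z0_i = 0 forces beta_i = 0 while z0_i = 1 leaves
+# beta_i free within [-big_M, big_M]; cp.sum(z0) <= sparse_bound then caps the
+# number of nonzero coefficients. A standalone sketch of the same relaxation in
+# plain cvxpy (hypothetical sizes, not used by the class):
+#
+#     import cvxpy as cp
+#     beta = cp.Variable(10)
+#     z0 = cp.Variable(10, boolean=True)
+#     constraints = [100 * z0 >= beta, 100 * z0 >= -beta, cp.sum(z0) <= 5]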
+
+
+class RidgedBestSubsetSelection(BestSubsetSelection):
+    """MIQP best subset selection estimator with ridge regularization."""
+
+    def __init__(self, sparse_bound, alpha=1.0, big_M=1000, hierarchy=None,
+                 ignore_psd_check=True, fit_intercept=False, normalize=False,
+                 copy_X=True, warm_start=False, solver=None, **kwargs):
+        """
+        Args:
+            sparse_bound (int):
+                Upper bound on sparsity. The upper bound on total number of
+                nonzero coefficients.
+            alpha (float):
+                Ridge regularization hyper-parameter.
+            big_M (float):
+                Upper bound on the norm of coefficients associated with each
+                cluster (groups of coefficients) ||Beta_c||_2
+            hierarchy (list):
+                A list of lists of integers storing hierarchy relations between
+                coefficients.
+                Each sublist contains indices of other coefficients
+                on which the coefficient associated with each element of
+                the list depends. e.g. hierarchy = [[1, 2], [0], []] means that
+                coefficient 0 depends on 1 and 2; 1 depends on 0; and 2 has no
+                dependence.
+            ignore_psd_check (bool):
+                Whether to ignore cvxpy's PSD checks of the matrix used in the
+                quadratic form. Default is True to avoid raising errors for
+                poorly conditioned matrices. But if you want to be strict set
+                to False.
+            fit_intercept (bool):
+                Whether the intercept should be estimated or not.
+                If False, the data is assumed to be already centered.
+            normalize (bool):
+                This parameter is ignored when fit_intercept is set to False.
+                If True, the regressors X will be normalized before regression
+                by subtracting the mean and dividing by the l2-norm.
+                If you wish to standardize, please use StandardScaler before
+                calling fit on an estimator with normalize=False.
+            copy_X (bool):
+                If True, X will be copied; else, it may be overwritten.
+            warm_start (bool):
+                When set to True, reuse the solution of the previous call to
+                fit as initialization; otherwise, just erase the previous
+                solution.
+            solver (str):
+                cvxpy backend solver to use. Supported solvers are:
+                ECOS, ECOS_BB, CVXOPT, SCS, GUROBI, Elemental,
+                GLPK and GLPK_MI (via CVXOPT GLPK interface).
+            **kwargs:
+                Keyword arguments passed to cvxpy solve.
+                See the cvxpy docs for more information.
+        """
+        super().__init__(
+            sparse_bound=sparse_bound, big_M=big_M, hierarchy=hierarchy,
+            ignore_psd_check=ignore_psd_check, fit_intercept=fit_intercept,
+            normalize=normalize, copy_X=copy_X, warm_start=warm_start, solver=solver,
+            **kwargs
+        )
+        self._alpha = cp.Parameter(nonneg=True, value=alpha)
+
+    @property
+    def alpha(self):
+        return self._alpha.value
+
+    @alpha.setter
+    def alpha(self, val):
+        self._alpha.value = val
+
+    def _gen_objective(self, X, y):
+        """Generate the objective function used in the ridged best subset model"""
+        c0 = 2 * X.shape[0]  # keeps hyperparameter scale independent
+        objective = super()._gen_objective(X, y) \
+            + c0 * self._alpha * cp.sum_squares(self._beta)
+        return objective
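+
+
+# Note: the factor c0 = 2 * X.shape[0] in _gen_objective keeps the ridge
+# hyperparameter roughly comparable across data sizes, since the quadratic-form
+# fit term grows with the number of samples. As a worked example (hypothetical
+# values), with 100 samples and alpha = 0.5 the added penalty is
+# 2 * 100 * 0.5 * ||beta||_2^2 = 100 * ||beta||_2^2.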
+
+
+class BestGroupSelection(BestSubsetSelection):
+    """MIQP Best group selection estimator."""
+
+    def __init__(self, groups, sparse_bound, big_M=1000, hierarchy=None,
+                 ignore_psd_check=True, fit_intercept=False, normalize=False,
+                 copy_X=True, warm_start=False, solver=None, **kwargs):
+        """
+        Args:
+            groups (list or ndarray):
+                array-like of integers specifying groups. Length should equal
+                the number of model parameters, where each integer entry
+                specifies the group each parameter corresponds to.
+            sparse_bound (int):
+                Upper bound on sparsity. The upper bound on the total number
+                of nonzero groups of coefficients.
+            big_M (float):
+                Upper bound on the norm of coefficients associated with each
+                cluster (groups of coefficients) ||Beta_c||_2
+            hierarchy (list):
+                A list of lists of integers storing hierarchy relations between
+                coefficients.
+                Each sublist contains indices of other coefficients
+                on which the coefficient associated with each element of
+                the list depends. e.g. hierarchy = [[1, 2], [0], []] means that
+                coefficient 0 depends on 1 and 2; 1 depends on 0; and 2 has no
+                dependence.
+            ignore_psd_check (bool):
+                Whether to ignore cvxpy's PSD checks of the matrix used in the
+                quadratic form. Default is True to avoid raising errors for
+                poorly conditioned matrices. But if you want to be strict set
+                to False.
+            fit_intercept (bool):
+                Whether the intercept should be estimated or not.
+                If False, the data is assumed to be already centered.
+            normalize (bool):
+                This parameter is ignored when fit_intercept is set to False.
+                If True, the regressors X will be normalized before regression
+                by subtracting the mean and dividing by the l2-norm.
+                If you wish to standardize, please use StandardScaler before
+                calling fit on an estimator with normalize=False.
+            copy_X (bool):
+                If True, X will be copied; else, it may be overwritten.
+            warm_start (bool):
+                When set to True, reuse the solution of the previous call to
+                fit as initialization; otherwise, just erase the previous
+                solution.
+            solver (str):
+                cvxpy backend solver to use. Supported solvers are:
+                ECOS, ECOS_BB, CVXOPT, SCS, GUROBI, Elemental,
+                GLPK and GLPK_MI (via CVXOPT GLPK interface).
+            **kwargs:
+                Keyword arguments passed to cvxpy solve.
+                See the cvxpy docs for more information.
+        """
+        super().__init__(
+            sparse_bound=sparse_bound, big_M=big_M, hierarchy=hierarchy,
+            ignore_psd_check=ignore_psd_check, fit_intercept=fit_intercept,
+            normalize=normalize, copy_X=copy_X, warm_start=warm_start, solver=solver,
+            **kwargs
+        )
+        self.groups = np.asarray(groups)
+        self._group_masks = [self.groups == i for i in np.unique(groups)]
+        self._z0 = cp.Variable(len(self._group_masks), boolean=True)
+
+    def _gen_constraints(self, X, y):
+        """Generate the constraints used to solve l0 regularization"""
+        # bound the number of active groups; without this constraint
+        # sparse_bound would have no effect
+        constraints = [cp.sum(self._z0) <= self._bound]
+        for i, mask in enumerate(self._group_masks):
+            constraints += [self._big_M * self._z0[i] >= self._beta[mask],
+                            self._big_M * self._z0[i] >= -self._beta[mask]]
+
+        if self.hierarchy is not None:
+            constraints += self._gen_hierarchy_constraints()
+        return constraints
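+
+
+# Note: groups are specified as one integer label per coefficient. For example
+# (hypothetical), groups=[0, 0, 1, 1, 2] defines three groups over five
+# coefficients, and sparse_bound=2 then allows at most two groups to be active:
+#
+#     from sparselm.model import BestGroupSelection
+#     estimator = BestGroupSelection(groups=[0, 0, 1, 1, 2], sparse_bound=2)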
+
+
+class RidgedBestGroupSelection(RidgedBestSubsetSelection, BestGroupSelection):
+    """Best group selection estimator with ridge regularization."""
+
+    def __init__(self, groups, sparse_bound, alpha=1.0, big_M=1000, hierarchy=None,
+                 ignore_psd_check=True, fit_intercept=False, normalize=False,
+                 copy_X=True, warm_start=False, solver=None, **kwargs):
+        """
+        Args:
+            groups (list or ndarray):
+                array-like of integers specifying groups. Length should equal
+                the number of model parameters, where each integer entry
+                specifies the group each parameter corresponds to.
+            sparse_bound (int):
+                Upper bound on sparsity. The upper bound on the total number
+                of nonzero groups of coefficients.
+            alpha (float):
+                Ridge regularization hyper-parameter.
+            big_M (float):
+                Upper bound on the norm of coefficients associated with each
+                cluster (groups of coefficients) ||Beta_c||_2
+            hierarchy (list):
+                A list of lists of integers storing hierarchy relations between
+                coefficients.
+                Each sublist contains indices of other coefficients
+                on which the coefficient associated with each element of
+                the list depends. e.g. hierarchy = [[1, 2], [0], []] means that
+                coefficient 0 depends on 1 and 2; 1 depends on 0; and 2 has no
+                dependence.
+            ignore_psd_check (bool):
+                Whether to ignore cvxpy's PSD checks of the matrix used in the
+                quadratic form. Default is True to avoid raising errors for
+                poorly conditioned matrices. But if you want to be strict set
+                to False.
+            fit_intercept (bool):
+                Whether the intercept should be estimated or not.
+                If False, the data is assumed to be already centered.
+            normalize (bool):
+                This parameter is ignored when fit_intercept is set to False.
+                If True, the regressors X will be normalized before regression
+                by subtracting the mean and dividing by the l2-norm.
+                If you wish to standardize, please use StandardScaler before
+                calling fit on an estimator with normalize=False.
+            copy_X (bool):
+                If True, X will be copied; else, it may be overwritten.
+            warm_start (bool):
+                When set to True, reuse the solution of the previous call to
+                fit as initialization; otherwise, just erase the previous
+                solution.
+            solver (str):
+                cvxpy backend solver to use. Supported solvers are:
+                ECOS, ECOS_BB, CVXOPT, SCS, GUROBI, Elemental,
+                GLPK and GLPK_MI (via CVXOPT GLPK interface).
+            **kwargs:
+                Keyword arguments passed to cvxpy solve.
+                See the cvxpy docs for more information.
+        """
+        # need to call super for sklearn clone function
+        super().__init__(
+            groups=groups, sparse_bound=sparse_bound, alpha=alpha, big_M=big_M,
+            hierarchy=hierarchy, ignore_psd_check=ignore_psd_check,
+            fit_intercept=fit_intercept, normalize=normalize, copy_X=copy_X,
+            warm_start=warm_start, solver=solver, **kwargs
+        )
+
+    def _gen_objective(self, X, y):
+        """Generate the ridged objective from RidgedBestSubsetSelection."""
+        return RidgedBestSubsetSelection._gen_objective(self, X, y)
+
+    def _gen_constraints(self, X, y):
+        """Generate the grouped constraints from BestGroupSelection."""
+        return BestGroupSelection._gen_constraints(self, X, y)
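+
+
+# A minimal usage sketch for the ridged group variant (hypothetical data;
+# assumes the sklearn-style fit API and an installed MIQP-capable solver):
+#
+#     import numpy as np
+#     from sparselm.model import RidgedBestGroupSelection
+#
+#     X, y = np.random.rand(50, 6), np.random.rand(50)
+#     estimator = RidgedBestGroupSelection(
+#         groups=[0, 0, 1, 1, 2, 2], sparse_bound=2, alpha=1e-3, solver="GUROBI")
+#     estimator.fit(X, y)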
diff --git a/sparselm/model/miqp/regularized_l0.py b/sparselm/model/miqp/regularized_l0.py
new file mode 100644
index 0000000..b8863a5
--- /dev/null
+++ b/sparselm/model/miqp/regularized_l0.py
@@ -0,0 +1,485 @@
+"""MIQP based solvers for sparse solutions with hierarchical constraints.
+
+Mixed L1L0 and L2L0 solvers.
+L1L0 proposed by Wenxuan Huang: https://arxiv.org/abs/1807.10753
+L2L0 proposed by Peichen Zhong: https://arxiv.org/abs/2204.13789
+
+Estimators allow optional inclusion of hierarchical constraints at the
+single-feature, single-coefficient level.
+"""
+
+__author__ = "Luis Barroso-Luque, Fengyu Xie"
+
+
+import warnings
+from abc import ABCMeta, abstractmethod
+import numpy as np
+import cvxpy as cp
+from cvxpy.atoms.affine.wraps import psd_wrap
+from sparselm.model.base import CVXEstimator
+
+
+class RegularizedL0(CVXEstimator):
+    """Implementation of MIQP l0 regularized estimator."""
+
+    def __init__(self, alpha=1.0, big_M=1000, hierarchy=None, ignore_psd_check=True,
+                 fit_intercept=False, normalize=False, copy_X=True, warm_start=False,
+                 solver=None, **kwargs):
+        """
+        Args:
+            alpha (float):
+                Regularization hyper-parameter.
+            big_M (float):
+                Upper bound on the norm of coefficients associated with each
+                cluster (groups of coefficients) ||Beta_c||_2
+            hierarchy (list):
+                A list of lists of integers storing hierarchy relations between
+                coefficients.
+                Each sublist contains indices of other coefficients
+                on which the coefficient associated with each element of
+                the list depends. e.g. hierarchy = [[1, 2], [0], []] means that
+                coefficient 0 depends on 1 and 2; 1 depends on 0; and 2 has no
+                dependence.
+            ignore_psd_check (bool):
+                Whether to ignore cvxpy's PSD checks of the matrix used in the
+                quadratic form. Default is True to avoid raising errors for
+                poorly conditioned matrices. But if you want to be strict set
+                to False.
+            fit_intercept (bool):
+                Whether the intercept should be estimated or not.
+                If False, the data is assumed to be already centered.
+            normalize (bool):
+                This parameter is ignored when fit_intercept is set to False.
+                If True, the regressors X will be normalized before regression
+                by subtracting the mean and dividing by the l2-norm.
+                If you wish to standardize, please use StandardScaler before
+                calling fit on an estimator with normalize=False.
+            copy_X (bool):
+                If True, X will be copied; else, it may be overwritten.
+            warm_start (bool):
+                When set to True, reuse the solution of the previous call to
+                fit as initialization; otherwise, just erase the previous
+                solution.
+            solver (str):
+                cvxpy backend solver to use. Supported solvers are:
+                ECOS, ECOS_BB, CVXOPT, SCS, GUROBI, Elemental,
+                GLPK and GLPK_MI (via CVXOPT GLPK interface).
+            **kwargs:
+                Keyword arguments passed to cvxpy solve.
+                See the cvxpy docs for more information.
+        """
+        super().__init__(fit_intercept=fit_intercept, normalize=normalize,
+                         copy_X=copy_X, warm_start=warm_start, solver=solver,
+                         **kwargs)
+
+        self.hierarchy = hierarchy
+        self._alpha = alpha
+        self._lambda0 = cp.Parameter(nonneg=True, value=alpha)
+        self._big_M = cp.Parameter(nonneg=True, value=big_M)
+        self.ignore_psd_check = ignore_psd_check
+        self._z0 = None
+
+    @property
+    def alpha(self):
+        return self._alpha
+
+    @alpha.setter
+    def alpha(self, val):
+        self._alpha = val
+        self._lambda0.value = val
+
+    @property
+    def big_M(self):
+        return self._big_M.value
+
+    @big_M.setter
+    def big_M(self, val):
+        self._big_M.value = val
+
+    def _gen_objective(self, X, y):
+        """Generate the quadratic form portion of objective"""
+        # psd_wrap will ignore cvxpy PSD checks, without it errors will
+        # likely be raised since correlation matrices are usually very
+        # poorly conditioned
+        self._z0 = cp.Variable(X.shape[1], boolean=True)
+        c0 = 2 * X.shape[0]  # keeps hyperparameter scale independent
+        XTX = psd_wrap(X.T @ X) if self.ignore_psd_check else X.T @ X
+
+        objective = cp.quad_form(self._beta, XTX) - 2 * y.T @ X @ self._beta \
+            + c0 * self._lambda0 * cp.sum(self._z0)
+        return objective
+
+    def _gen_constraints(self, X, y):
+        """Generate the constraints used to solve l0 regularization"""
+        constraints = [self._big_M * self._z0 >= self._beta,
+                       self._big_M * self._z0 >= -self._beta]
+
+        if self.hierarchy is not None:
+            constraints += self._gen_hierarchy_constraints()
+        return constraints
+
+    def _gen_hierarchy_constraints(self):
+        """Generate single feature hierarchy constraints"""
+        return [self._z0[high_id] <= self._z0[sub_id]
+                for high_id, sub_ids in enumerate(self.hierarchy)
+                for sub_id in sub_ids]
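+
+
+# Note: _gen_objective relies on the expansion
+#     ||X @ beta - y||^2 = beta.T @ (X.T @ X) @ beta - 2 * y.T @ X @ beta + y.T @ y,
+# dropping the constant y.T @ y term; cp.quad_form(beta, XTX) therefore yields
+# the same minimizer as cp.sum_squares(X @ beta - y) but keeps the problem in
+# an explicit quadratic form.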
+
+
+class MixedL0(RegularizedL0, metaclass=ABCMeta):
+    """Abstract base class for mixed L0 regularization models: L1L0 and L2L0."""
+
+    def __init__(self, alpha=1.0, l0_ratio=0.5, big_M=1000, hierarchy=None,
+                 ignore_psd_check=True, fit_intercept=False, normalize=False,
+                 copy_X=True, warm_start=False, solver=None, **kwargs):
+        """
+        Args:
+            alpha (float):
+                Regularization hyper-parameter.
+            l0_ratio (float):
+                Mixing parameter between l1 and l0 regularization.
+            big_M (float):
+                Upper bound on the norm of coefficients associated with each
+                cluster (groups of coefficients) ||Beta_c||_2
+            hierarchy (list):
+                A list of lists of integers storing hierarchy relations between
+                coefficients.
+                Each sublist contains indices of other coefficients
+                on which the coefficient associated with each element of
+                the list depends. e.g. hierarchy = [[1, 2], [0], []] means that
+                coefficient 0 depends on 1 and 2; 1 depends on 0; and 2 has no
+                dependence.
+            ignore_psd_check (bool):
+                Whether to ignore cvxpy's PSD checks of the matrix used in the
+                quadratic form. Default is True to avoid raising errors for
+                poorly conditioned matrices. But if you want to be strict set
+                to False.
+            fit_intercept (bool):
+                Whether the intercept should be estimated or not.
+                If False, the data is assumed to be already centered.
+            normalize (bool):
+                This parameter is ignored when fit_intercept is set to False.
+                If True, the regressors X will be normalized before regression
+                by subtracting the mean and dividing by the l2-norm.
+                If you wish to standardize, please use StandardScaler before
+                calling fit on an estimator with normalize=False.
+            copy_X (bool):
+                If True, X will be copied; else, it may be overwritten.
+            warm_start (bool):
+                When set to True, reuse the solution of the previous call to
+                fit as initialization; otherwise, just erase the previous
+                solution.
+            solver (str):
+                cvxpy backend solver to use. Supported solvers are:
+                ECOS, ECOS_BB, CVXOPT, SCS, GUROBI, Elemental,
+                GLPK and GLPK_MI (via CVXOPT GLPK interface).
+            **kwargs:
+                Keyword arguments passed to cvxpy solve.
+                See the cvxpy docs for more information.
+        """
+        super().__init__(
+            alpha=alpha, big_M=big_M, hierarchy=hierarchy,
+            ignore_psd_check=ignore_psd_check, fit_intercept=fit_intercept,
+            normalize=normalize, copy_X=copy_X, warm_start=warm_start, solver=solver,
+            **kwargs
+        )
+
+        if not 0 <= l0_ratio <= 1:
+            raise ValueError('l0_ratio must be between 0 and 1.')
+        elif l0_ratio == 0.0:
+            warnings.warn(
+                "It's more efficient to use Ridge/Lasso instead of l0_ratio=0",
+                UserWarning)
+
+        self._lambda0.value = l0_ratio * alpha
+        self._lambda1 = cp.Parameter(nonneg=True, value=(1 - l0_ratio) * alpha)
+        # save exact value so sklearn clone is happy
+        self._l0_ratio = l0_ratio
+
+    @RegularizedL0.alpha.setter
+    def alpha(self, val):
+        self._alpha = val
+        self._lambda0.value = self.l0_ratio * val
+        self._lambda1.value = (1 - self.l0_ratio) * val
+
+    @property
+    def l0_ratio(self):
+        return self._l0_ratio
+
+    @l0_ratio.setter
+    def l0_ratio(self, val):
+        if not 0 <= val <= 1:
+            raise ValueError('l0_ratio must be between 0 and 1.')
+        self._l0_ratio = val
+        self._lambda0.value = val * self.alpha
+        self._lambda1.value = (1 - val) * self.alpha
+
+    @abstractmethod
+    def _gen_objective(self, X, y):
+        """Generate optimization objective."""
+        # delegate up the MRO so concrete subclasses can build their extra
+        # penalty on top of the inherited L0 objective
+        return super()._gen_objective(X, y)
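+
+
+# Note: alpha and l0_ratio jointly determine the two penalty weights:
+# lambda0 = l0_ratio * alpha multiplies the L0 term, and
+# lambda1 = (1 - l0_ratio) * alpha multiplies the L1 (or L2) term. For example
+# (hypothetical values), alpha=1.0 with l0_ratio=0.2 gives lambda0 = 0.2 and
+# lambda1 = 0.8.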
+
+
+class L1L0(MixedL0):
+    """
+    Estimator with L1L0 regularization solved with mixed integer programming
+    as discussed in:
+    https://arxiv.org/abs/1807.10753
+
+    Gurobi is no longer a required dependency: any available MIQP-capable
+    solver is used, falling back to ECOS_BB if none is specified. ECOS_BB is
+    only supported in cvxpy >= 1.1.6; it is not selected automatically by
+    cvxpy and must be requested explicitly by passing solver='ECOS_BB' to the
+    constructor.
+
+    Installing Gurobi is nevertheless highly recommended (free academic
+    licenses are available); ECOS_BB also works but can be very slow.
+
+    Regularized model is:
+        ||X * Beta - y||^2 + alpha * l0_ratio * ||Beta||_0
+                           + alpha * (1 - l0_ratio) * ||Beta||_1
+    """
+    def __init__(self, alpha=1.0, l0_ratio=0.5, big_M=1000, hierarchy=None,
+                 ignore_psd_check=True, fit_intercept=False, normalize=False,
+                 copy_X=True, warm_start=False, solver=None, **kwargs):
+        """
+        Args:
+            alpha (float):
+                Regularization hyper-parameter.
+            l0_ratio (float):
+                Mixing parameter between l1 and l0 regularization.
+            big_M (float):
+                Upper bound on the norm of coefficients associated with each
+                cluster (groups of coefficients) ||Beta_c||_2
+            hierarchy (list):
+                A list of lists of integers storing hierarchy relations between
+                coefficients.
+                Each sublist contains indices of other coefficients
+                on which the coefficient associated with each element of
+                the list depends. e.g. hierarchy = [[1, 2], [0], []] means that
+                coefficient 0 depends on 1 and 2; 1 depends on 0; and 2 has no
+                dependence.
+            ignore_psd_check (bool):
+                Whether to ignore cvxpy's PSD checks of the matrix used in the
+                quadratic form. Default is True to avoid raising errors for
+                poorly conditioned matrices. But if you want to be strict set
+                to False.
+            fit_intercept (bool):
+                Whether the intercept should be estimated or not.
+                If False, the data is assumed to be already centered.
+            normalize (bool):
+                This parameter is ignored when fit_intercept is set to False.
+                If True, the regressors X will be normalized before regression
+                by subtracting the mean and dividing by the l2-norm.
+                If you wish to standardize, please use StandardScaler before
+                calling fit on an estimator with normalize=False.
+            copy_X (bool):
+                If True, X will be copied; else, it may be overwritten.
+            warm_start (bool):
+                When set to True, reuse the solution of the previous call to
+                fit as initialization; otherwise, just erase the previous
+                solution.
+            solver (str):
+                cvxpy backend solver to use. Supported solvers are:
+                ECOS, ECOS_BB, CVXOPT, SCS, GUROBI, Elemental,
+                GLPK and GLPK_MI (via CVXOPT GLPK interface).
+            **kwargs:
+                Keyword arguments passed to cvxpy solve.
+                See the cvxpy docs for more information.
+        """
+        super().__init__(alpha=alpha, l0_ratio=l0_ratio, big_M=big_M,
+                         hierarchy=hierarchy, ignore_psd_check=ignore_psd_check,
+                         fit_intercept=fit_intercept,
+                         normalize=normalize, copy_X=copy_X,
+                         warm_start=warm_start, solver=solver, **kwargs)
+        self._z1 = None
+
+    def _gen_constraints(self, X, y):
+        """Generate the constraints used to solve l1l0 regularization"""
+        constraints = super()._gen_constraints(X, y)
+        # L1 constraints (why not do an l1 norm in the objective instead?)
+        constraints += [self._z1 >= self._beta,
+                        self._z1 >= -1.0 * self._beta]
+
+        return constraints
+
+    def _gen_objective(self, X, y):
+        """Generate the objective function used in l1l0 regression model"""
+        self._z1 = cp.Variable(X.shape[1])
+        c0 = 2 * X.shape[0]  # keeps hyperparameter scale independent
+        objective = super()._gen_objective(X, y) \
+            + c0 * self._lambda1 * cp.sum(self._z1)
+
+        return objective
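+
+
+# Note: the auxiliary variable z1 with constraints z1 >= beta and z1 >= -beta
+# satisfies z1_i >= |beta_i|; because the objective minimizes cp.sum(z1), the
+# sum equals ||beta||_1 at the optimum -- the standard epigraph reformulation
+# of the L1 norm.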
+
+
+class L2L0(MixedL0):
+    """
+    Estimator with L2L0 regularization solved with mixed integer programming
+    proposed by Peichen Zhong.
+
+    https://arxiv.org/abs/2204.13789
+
+    Regularized model is:
+        ||X * Beta - y||^2 + alpha * l0_ratio * ||Beta||_0
+                           + alpha * (1 - l0_ratio) * ||Beta||^2_2
+    """
+
+    def _gen_objective(self, X, y):
+        """Generate the objective function used in l2l0 regression model"""
+        c0 = 2 * X.shape[0]  # keeps hyperparameter scale independent
+        objective = super()._gen_objective(X, y) \
+            + c0 * self._lambda1 * cp.sum_squares(self._beta)
+
+        return objective
+
+
+class GroupedL0(RegularizedL0):
+    """Estimator with grouped L0 pseudo-norm regularization."""
+
+    def __init__(self, groups, alpha=1.0, big_M=1000, hierarchy=None,
+                 ignore_psd_check=True, fit_intercept=False, normalize=False,
+                 copy_X=True, warm_start=False, solver=None, **kwargs):
+        """
+        Args:
+            groups (list or ndarray):
+                array-like of integers specifying groups. Length should equal
+                the number of model parameters, where each integer entry
+                specifies the group each parameter corresponds to.
+            alpha (float):
+                Regularization hyper-parameter.
+            big_M (float):
+                Upper bound on the norm of coefficients associated with each
+                cluster (groups of coefficients) ||Beta_c||_2
+            hierarchy (list):
+                A list of lists of integers storing hierarchy relations between
+                coefficients.
+                Each sublist contains indices of other coefficients
+                on which the coefficient associated with each element of
+                the list depends. e.g. hierarchy = [[1, 2], [0], []] means that
+                coefficient 0 depends on 1 and 2; 1 depends on 0; and 2 has no
+                dependence.
+            ignore_psd_check (bool):
+                Whether to ignore cvxpy's PSD checks of the matrix used in the
+                quadratic form. Default is True to avoid raising errors for
+                poorly conditioned matrices. But if you want to be strict set
+                to False.
+            fit_intercept (bool):
+                Whether the intercept should be estimated or not.
+                If False, the data is assumed to be already centered.
+            normalize (bool):
+                This parameter is ignored when fit_intercept is set to False.
+                If True, the regressors X will be normalized before regression
+                by subtracting the mean and dividing by the l2-norm.
+                If you wish to standardize, please use StandardScaler before
+                calling fit on an estimator with normalize=False.
+            copy_X (bool):
+                If True, X will be copied; else, it may be overwritten.
+            warm_start (bool):
+                When set to True, reuse the solution of the previous call to
+                fit as initialization; otherwise, just erase the previous
+                solution.
+            solver (str):
+                cvxpy backend solver to use. Supported solvers are:
+                ECOS, ECOS_BB, CVXOPT, SCS, GUROBI, Elemental,
+                GLPK and GLPK_MI (via CVXOPT GLPK interface).
+            **kwargs:
+                Keyword arguments passed to cvxpy solve.
+                See the cvxpy docs for more information.
+        """
+        super().__init__(
+            alpha=alpha, big_M=big_M, hierarchy=hierarchy,
+            ignore_psd_check=ignore_psd_check, fit_intercept=fit_intercept,
+            normalize=normalize, copy_X=copy_X, warm_start=warm_start, solver=solver,
+            **kwargs)
+
+        self.groups = np.asarray(groups)
+        self._group_masks = [self.groups == i for i in np.unique(groups)]
+        self._z0 = cp.Variable(len(self._group_masks), boolean=True)
+
+    def _gen_objective(self, X, y):
+        """Generate the quadratic form portion of objective"""
+        c0 = 2 * X.shape[0]  # keeps hyperparameter scale independent
+        XTX = psd_wrap(X.T @ X) if self.ignore_psd_check else X.T @ X
+        objective = cp.quad_form(self._beta, XTX) - 2 * y.T @ X @ self._beta \
+            + c0 * self._lambda0 * cp.sum(self._z0)
+        return objective
+
+    def _gen_constraints(self, X, y):
+        """Generate the constraints used to solve l0 regularization"""
+        constraints = []
+        for i, mask in enumerate(self._group_masks):
+            constraints += [self._big_M * self._z0[i] >= self._beta[mask],
+                            self._big_M * self._z0[i] >= -self._beta[mask]]
+
+        if self.hierarchy is not None:
+            constraints += self._gen_hierarchy_constraints()
+        return constraints
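+
+
+# Note: hierarchy constraints compose with grouping. For example (hypothetical),
+# hierarchy=[[], [0], [0]] allows groups 1 and 2 to be active only when group 0
+# is active, via the generated constraints z0[1] <= z0[0] and z0[2] <= z0[0].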
+
+
+class GroupedL2L0(MixedL0, GroupedL0):
+    """
+    Estimator with grouped L2L0 regularization solved with mixed integer
+    programming.
+    """
+
+    def __init__(self, groups, alpha=1.0, l0_ratio=0.5, big_M=1000, hierarchy=None,
+                 ignore_psd_check=True, fit_intercept=False, normalize=False,
+                 copy_X=True, warm_start=False, solver=None, **kwargs):
+        """
+        Args:
+            groups (list or ndarray):
+                array-like of integers specifying groups. Length should equal
+                the number of model parameters, where each integer entry
+                specifies the group each parameter corresponds to.
+            alpha (float):
+                Regularization hyper-parameter.
+            l0_ratio (float):
+                Mixing parameter between l1 and l0 regularization.
+            big_M (float):
+                Upper bound on the norm of coefficients associated with each
+                cluster (groups of coefficients) ||Beta_c||_2
+            hierarchy (list):
+                A list of lists of integers storing hierarchy relations between
+                coefficients.
+                Each sublist contains indices of other coefficients
+                on which the coefficient associated with each element of
+                the list depends. e.g. hierarchy = [[1, 2], [0], []] means that
+                coefficient 0 depends on 1 and 2; 1 depends on 0; and 2 has no
+                dependence.
+            ignore_psd_check (bool):
+                Whether to ignore cvxpy's PSD checks of the matrix used in the
+                quadratic form. Default is True to avoid raising errors for
+                poorly conditioned matrices. But if you want to be strict set
+                to False.
+            fit_intercept (bool):
+                Whether the intercept should be estimated or not.
+                If False, the data is assumed to be already centered.
+            normalize (bool):
+                This parameter is ignored when fit_intercept is set to False.
+                If True, the regressors X will be normalized before regression
+                by subtracting the mean and dividing by the l2-norm.
+                If you wish to standardize, please use StandardScaler before
+                calling fit on an estimator with normalize=False.
+            copy_X (bool):
+                If True, X will be copied; else, it may be overwritten.
+            warm_start (bool):
+                When set to True, reuse the solution of the previous call to
+                fit as initialization; otherwise, just erase the previous
+                solution.
+            solver (str):
+                cvxpy backend solver to use. Supported solvers are:
+                ECOS, ECOS_BB, CVXOPT, SCS, GUROBI, Elemental,
+                GLPK and GLPK_MI (via CVXOPT GLPK interface).
+            **kwargs:
+                Keyword arguments passed to cvxpy solve.
+                See the cvxpy docs for more information.
+        """
+        # need to call super for sklearn clone function
+        super().__init__(groups=groups, alpha=alpha, l0_ratio=l0_ratio, big_M=big_M,
+                         hierarchy=hierarchy, ignore_psd_check=ignore_psd_check,
+                         fit_intercept=fit_intercept,
+                         normalize=normalize, copy_X=copy_X,
+                         warm_start=warm_start, solver=solver, **kwargs)
+
+    def _gen_objective(self, X, y):
+        """Generate the objective function used in grouped l2l0 regression model"""
+        c0 = 2 * X.shape[0]  # keeps hyperparameter scale independent
+        objective = super()._gen_objective(X, y) \
+            + c0 * self._lambda1 * cp.sum_squares(self._beta)
+
+        return objective
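+
+
+# A minimal usage sketch combining grouping with the mixed penalty
+# (hypothetical data and values; assumes the sklearn-style fit API):
+#
+#     import numpy as np
+#     from sparselm.model import GroupedL2L0
+#
+#     X, y = np.random.rand(50, 6), np.random.rand(50)
+#     estimator = GroupedL2L0(
+#         groups=[0, 0, 1, 1, 2, 2], alpha=1e-4, l0_ratio=0.5, solver="GUROBI")
+#     estimator.fit(X, y)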
diff --git a/sparselm/model/mixedL0.py b/sparselm/model/mixedL0.py
deleted file mode 100644
index 09d6c9a..0000000
--- a/sparselm/model/mixedL0.py
+++ /dev/null
@@ -1,260 +0,0 @@
-"""MIQP based solvers for sparse solutions with hierarchical constraints
-
-Mixed L1L0 and L2L0 solvers.
-L1L0 proposed by Wenxuan Huang: https://arxiv.org/abs/1807.10753
-L2L0 proposed by Peichen Zhong
-
-Estimators allow optional inclusion of hierarchical at the single feature
-single coefficient level.
-"""
-
-__author__ = "Luis Barroso-Luque, Fengyu Xie"
-
-
-import warnings
-from abc import ABCMeta
-import cvxpy as cp
-from cvxpy.atoms.affine.wraps import psd_wrap
-from sparselm.model.base import CVXEstimator
-
-
-class mixedL0(CVXEstimator, metaclass=ABCMeta):
-    """Abstract base class for mixed L0 regularization models: L1L0 and L2L0.
-    """
-    def __init__(self, alpha=1.0, l0_ratio=0.5, big_M=1000, hierarchy=None,
-                 ignore_psd_check=True, fit_intercept=False, normalize=False,
-                 copy_X=True, warm_start=False, solver=None, **kwargs):
-        """
-        Args:
-            alpha (float):
-                Regularization hyper-parameter.
-            l0_ratio (float):
-                Mixing parameter between l1 and l0 regularization.
-            big_M (float):
-                Upper bound on the norm of coefficients associated with each
-                cluster (groups of coefficients) ||Beta_c||_2
-            hierarchy (list):
-                A list of lists of integers storing hierarchy relations between
-                coefficients.
-                Each sublist contains indices of other coefficients
-                on which the coefficient associated with each element of
-                the list depends. i.e. hierarchy = [[1, 2], [0], []] mean that
-                coefficient 0 depends on 1, and 2; 1 depends on 0, and 2 has no
-                dependence.
-            ignore_psd_check (bool):
-                Wether to ignore cvxpy's PSD checks of matrix used in quadratic
-                form. Default is True to avoid raising errors for poorly
-                conditioned matrices. But if you want to be strict set to False.
-            fit_intercept (bool):
-                Whether the intercept should be estimated or not.
-                If False, the data is assumed to be already centered.
- normalize (bool): - This parameter is ignored when fit_intercept is set to False. - If True, the regressors X will be normalized before regression - by subtracting the mean and dividing by the l2-norm. - If you wish to standardize, please use StandardScaler before - calling fit on an estimator with normalize=False - copy_X (bool): - If True, X will be copied; else, it may be overwritten. - warm_start (bool): - When set to True, reuse the solution of the previous call to - fit as initialization, otherwise, just erase the previous - solution. - solver (str): - cvxpy backend solver to use. Supported solvers are: - ECOS, ECOS_BB, CVXOPT, SCS, GUROBI, Elemental. - GLPK and GLPK_MI (via CVXOPT GLPK interface) - **kwargs: - Kewyard arguments passed to cvxpy solve. - See docs linked above for more information. - """ - super().__init__(fit_intercept=fit_intercept, normalize=normalize, - copy_X=copy_X, warm_start=warm_start, solver=solver, - **kwargs) - - if not 0 <= l0_ratio <= 1: - raise ValueError('l0_ratio must be between 0 and 1.') - elif l0_ratio == 0.0: - warnings.warn( - "It's more efficient to use Ridge/Lasso instead of l0_ratio=0", - UserWarning) - - self.hierarchy = hierarchy - self._alpha = alpha - self._big_M = cp.Parameter(nonneg=True, value=big_M) - self._lambda0 = cp.Parameter(nonneg=True, value=l0_ratio * alpha) - self._lambda1 = cp.Parameter(nonneg=True, value=(1 - l0_ratio) * alpha) - # save exact value so sklearn clone is happy dappy - self._l0_ratio = l0_ratio - self.ignore_psd_check = ignore_psd_check - self._z0 = None - - @property - def alpha(self): - return self._alpha - - @alpha.setter - def alpha(self, val): - self._alpha = val - self._lambda0.value = self.l0_ratio * val - self._lambda1.value = (1 - self.l0_ratio) * val - - @property - def big_M(self): - return self._big_M.value - - @big_M.setter - def big_M(self, val): - self._big_M.value = val - - @property - def l0_ratio(self): - return self._l0_ratio - - @l0_ratio.setter - def l0_ratio(self, val): - if not 0 <= val <= 1: - raise ValueError('l0_ratio must be between 0 and 1.') - self._l0_ratio = val - self._lambda0.value = val * self.alpha - self._lambda1.value = (1 - val) * self.alpha - - def _gen_objective(self, X, y): - """Generate the quadratic form portion of objective""" - # psd_wrap will ignore cvxpy PSD checks, without it errors will - # likely be raised since correlation matrices are usually very - # poorly conditioned - self._z0 = cp.Variable(X.shape[1], boolean=True) - c0 = 2 * X.shape[0] # keeps hyperparameter scale independent - XTX = psd_wrap(X.T @ X) if self.ignore_psd_check else X.T @ X - - objective = cp.quad_form(self._beta, XTX) - 2 * y.T @ X @ self._beta \ - + c0 * self._lambda0 * cp.sum(self._z0) - return objective - - def _gen_constraints(self, X, y): - """Generate the constraints used to solve l0 regularization""" - constraints = [self._big_M * self._z0 >= self._beta, - self._big_M * self._z0 >= -self._beta] - - if self.hierarchy is not None: - constraints += self._gen_hierarchy_constraints() - return constraints - - def _gen_hierarchy_constraints(self): - """Generate single feature hierarchy constraints""" - return [self._z0[high_id] <= self._z0[sub_id] - for high_id, sub_ids in enumerate(self.hierarchy) - for sub_id in sub_ids] - - -class L1L0(mixedL0): - """ - Estimator with L1L0 regularization solved with mixed integer programming - as discussed in: - https://arxiv.org/abs/1807.10753 - - Removed Gurobi dependency! - Uses any solver it has in hand. If nothing, will use ECOS_BB. 
- ECOS_BB is only supported after cvxpy-1.1.6. It will not be given - automatically pulled up by cvxpy, and has to be explicitly called - passed in the constructor with solver='ECOS_BB'. - - Installation of Gurobi is no longer a must, but highly recommended. - You can get a free academic gurobi license... - ECOS_BB also works but can be very slow. - - Regularized model is: - ||X * Beta - y||^2 + alpha * l0_ratio * ||Beta||_0 - + alpha * (1 - l0_ratio) * ||Beta||_1 - """ - def __init__(self, alpha=1.0, l0_ratio=0.5, big_M=1000, hierarchy=None, - ignore_psd_check=True, fit_intercept=False, normalize=False, - copy_X=True, warm_start=False, solver=None, **kwargs): - """ - Args: - alpha (float): - Regularization hyper-parameter. - l0_ratio (float): - Mixing parameter between l1 and l0 regularization. - big_M (float): - Upper bound on the norm of coefficients associated with each - cluster (groups of coefficients) ||Beta_c||_2 - hierarchy (list): - A list of lists of integers storing hierarchy relations between - coefficients. - Each sublist contains indices of other coefficients - on which the coefficient associated with each element of - the list depends. i.e. hierarchy = [[1, 2], [0], []] mean that - coefficient 0 depends on 1, and 2; 1 depends on 0, and 2 has no - dependence. - ignore_psd_check (bool): - Wether to ignore cvxpy's PSD checks of matrix used in quadratic - form. Default is True to avoid raising errors for poorly - conditioned matrices. But if you want to be strict set to False. - fit_intercept (bool): - Whether the intercept should be estimated or not. - If False, the data is assumed to be already centered. - normalize (bool): - This parameter is ignored when fit_intercept is set to False. - If True, the regressors X will be normalized before regression - by subtracting the mean and dividing by the l2-norm. - If you wish to standardize, please use StandardScaler before - calling fit on an estimator with normalize=False - copy_X (bool): - If True, X will be copied; else, it may be overwritten. - warm_start (bool): - When set to True, reuse the solution of the previous call to - fit as initialization, otherwise, just erase the previous - solution. - solver (str): - cvxpy backend solver to use. Supported solvers are: - ECOS, ECOS_BB, CVXOPT, SCS, GUROBI, Elemental. - GLPK and GLPK_MI (via CVXOPT GLPK interface) - **kwargs: - Kewyard arguments passed to cvxpy solve. - See docs linked above for more information. - """ - super().__init__(alpha=alpha, l0_ratio=l0_ratio, big_M=big_M, - hierarchy=hierarchy, ignore_psd_check=ignore_psd_check, - fit_intercept=fit_intercept, - normalize=normalize, copy_X=copy_X, - warm_start=warm_start, solver=solver, **kwargs) - self._z1 = None - - def _gen_constraints(self, X, y): - """Generate the constraints used to solve l1l0 regularization""" - constraints = super()._gen_constraints(X, y) - # L1 constraints (why not do an l1 norm in the objective instead?) - constraints += [self._z1 >= self._beta, - self._z1 >= -1.0 * self._beta] - - return constraints - - def _gen_objective(self, X, y): - """Generate the objective function used in l1l0 regression model""" - self._z1 = cp.Variable(X.shape[1]) - c0 = 2 * X.shape[0] # keeps hyperparameter scale independent - objective = super()._gen_objective(X, y) \ - + c0 * self._lambda1 * cp.sum(self._z1) - - return objective - - -class L2L0(mixedL0): - """ - Estimator with L1L0 regularization solved with mixed integer programming - proposed by Peichen Zhong. 
- - Regularized model is: - ||X * Beta - y||^2 + alpha * l0_ratio * ||Beta||_0 - + alpha * (1 - l0_ratio) * ||Beta||^2_2 - """ - - def _gen_objective(self, X, y): - """Generate the objective function used in l2l0 regression model""" - c0 = 2 * X.shape[0] # keeps hyperparameter scale independent - objective = super()._gen_objective(X, y) \ - + c0 * self._lambda1 * cp.sum_squares(self._beta) - - return objective