diff --git a/.github/workflows/versioning.yaml b/.github/workflows/versioning.yaml index d2709e6..e23dcb8 100644 --- a/.github/workflows/versioning.yaml +++ b/.github/workflows/versioning.yaml @@ -47,7 +47,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: ${{ matrix.python-version }} + python-version: 3.13 - name: Install package run: make install - name: Build package diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29..8ef067d 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,5 @@ +- bump: minor + changes: + added: + - Add hyperparameter tuning for L0 implementation with option to holdout targets. + - Add method to evaluate robustness of calibration to target holdouts. diff --git a/pyproject.toml b/pyproject.toml index eb11a84..7dd328f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ dependencies = [ "pandas", "tqdm", "l0-python", + "optuna", ] [project.optional-dependencies] diff --git a/src/microcalibrate/__init__.py b/src/microcalibrate/__init__.py index 0b8f3fa..ab06bd3 100644 --- a/src/microcalibrate/__init__.py +++ b/src/microcalibrate/__init__.py @@ -1,2 +1,7 @@ from .calibration import Calibration -from .evaluation import evaluate_estimate_distance_to_targets +from .evaluation import ( + evaluate_estimate_distance_to_targets, + evaluate_holdout_robustness, + evaluate_sparse_weights, +) +from .hyperparameter_tuning import tune_l0_hyperparameters diff --git a/src/microcalibrate/calibration.py b/src/microcalibrate/calibration.py index a42d76a..3a2a9ef 100644 --- a/src/microcalibrate/calibration.py +++ b/src/microcalibrate/calibration.py @@ -1,11 +1,20 @@ import logging -from typing import Callable, List, Optional +from typing import Any, Callable, Dict, List, Optional import numpy as np +import optuna import pandas as pd import torch from torch import Tensor +from microcalibrate.evaluation import ( + evaluate_holdout_robustness as _evaluate_holdout_robustness, +) +from microcalibrate.hyperparameter_tuning import ( + tune_l0_hyperparameters as _tune_l0_hyperparameters, +) +from microcalibrate.reweight import reweight + class Calibration: def __init__( @@ -14,7 +23,9 @@ def __init__( targets: np.ndarray, target_names: Optional[np.ndarray] = None, estimate_matrix: Optional[pd.DataFrame] = None, - estimate_function: Optional[Callable[[Tensor], Tensor]] = None, + estimate_function: Optional[ + Callable[[torch.Tensor], torch.Tensor] + ] = None, epochs: Optional[int] = 32, noise_level: Optional[float] = 10.0, learning_rate: Optional[float] = 1e-3, @@ -23,11 +34,14 @@ def __init__( excluded_targets: Optional[List[str]] = None, csv_path: Optional[str] = None, device: str = "cpu", # fix to cpu for now to avoid user device-specific issues - l0_lambda: float = 5e-6, # best between 1e-6 and 1e-5 - init_mean: float = 0.999, # initial proportion with non-zero weights, set near 0 - sparse_learning_rate: float = 0.2, - temperature: float = 0.5, # usual values .5 to 3 + l0_lambda: Optional[float] = 5e-6, # best between 1e-6 and 1e-5 + init_mean: Optional[ + float + ] = 0.999, # initial proportion with non-zero weights, set near 0 + temperature: Optional[float] = 0.5, # usual values .5 to 3 + sparse_learning_rate: Optional[float] = 0.2, regularize_with_l0: Optional[bool] = False, + seed: Optional[int] = 42, ): """Initialize the Calibration class. @@ -36,7 +50,7 @@ def __init__( targets (np.ndarray): Array of target values. target_names (Optional[np.ndarray]): Optional names of the targets for logging. 
Defaults to None. You MUST pass these names if you are not passing in an estimate matrix, and just passing in an estimate function. estimate_matrix (pd.DataFrame): DataFrame containing the estimate matrix. - estimate_function (Optional[Callable[[Tensor], Tensor]]): Function to estimate targets from weights. Defaults to None, in which case it will use the estimate_matrix. + estimate_function (Optional[Callable[[torch.Tensor], torch.Tensor]]): Function to estimate targets from weights. Defaults to None, in which case it will use the estimate_matrix. epochs (int): Optional number of epochs for calibration. Defaults to 32. noise_level (float): Optional level of noise to add to weights. Defaults to 10.0. learning_rate (float): Optional learning rate for the optimizer. Defaults to 1e-3. @@ -65,9 +79,9 @@ def __init__( self.original_estimate_matrix = estimate_matrix self.original_targets = targets self.original_target_names = target_names + self.original_estimate_function = estimate_function self.weights = weights self.excluded_targets = excluded_targets - self.estimate_function = estimate_function self.epochs = epochs self.noise_level = noise_level self.learning_rate = learning_rate @@ -81,10 +95,24 @@ def __init__( self.temperature = temperature self.sparse_learning_rate = sparse_learning_rate self.regularize_with_l0 = regularize_with_l0 + self.seed = seed + + if device is not None: + self.device = torch.device(device) + torch.manual_seed(self.seed) + else: + self.device = torch.device( + "cuda" + if torch.cuda.is_available() + else "mps" if torch.mps.is_available() else "cpu" + ) + if self.device == "cuda": + torch.cuda.manual_seed(self.seed) self.estimate_matrix = None self.targets = None self.target_names = None + self.estimate_function = None self.excluded_target_data = {} # Set target names from estimate_matrix if not provided @@ -107,7 +135,7 @@ def __init__( else: self.estimate_matrix = None - if self.estimate_function is None: + if self.original_estimate_function is None: if self.estimate_matrix is not None: self.estimate_function = ( lambda weights: weights @ self.estimate_matrix @@ -127,16 +155,12 @@ def calibrate(self) -> None: self._assess_targets( estimate_function=self.estimate_function, - estimate_matrix=getattr( - self, "original_estimate_matrix", self.estimate_matrix - ), + estimate_matrix=self.estimate_matrix, weights=self.weights, targets=self.targets, target_names=self.target_names, ) - from .reweight import reweight - new_weights, sparse_weights, self.performance_df = reweight( original_weights=self.weights, estimate_function=self.estimate_function, @@ -210,9 +234,9 @@ def exclude_targets( initial_weights_tensor = torch.tensor( self.weights, dtype=torch.float32, device=self.device ) - if self.estimate_function is not None: + if self.original_estimate_function is not None: initial_estimates_all = ( - self.estimate_function(initial_weights_tensor) + self.original_estimate_function(initial_weights_tensor) .detach() .cpu() .numpy() @@ -240,6 +264,10 @@ def exclude_targets( dtype=torch.float32, device=self.device, ) + + self.estimate_function = ( + lambda weights: weights @ self.estimate_matrix + ) else: raise ValueError( "Either estimate_function or estimate_matrix must be provided" @@ -261,6 +289,10 @@ def exclude_targets( dtype=torch.float32, device=self.device, ) + if self.original_estimate_function is None: + self.estimate_function = ( + lambda weights: weights @ self.estimate_matrix + ) else: self.estimate_matrix = None @@ -268,13 +300,13 @@ def exclude_targets( 
self.targets = targets_array self.target_names = target_names - def estimate(self) -> pd.Series: + def estimate(self, weights: Optional[np.ndarray] = None) -> pd.Series: + if weights is None: + weights = self.weights return pd.Series( index=self.target_names, data=self.estimate_function( - torch.tensor( - self.weights, dtype=torch.float32, device=self.device - ) + torch.tensor(weights, dtype=torch.float32, device=self.device) ) .cpu() .detach() @@ -283,7 +315,7 @@ def estimate(self) -> pd.Series: def _assess_targets( self, - estimate_function: Callable[[Tensor], Tensor], + estimate_function: Callable[[torch.Tensor], torch.Tensor], estimate_matrix: Optional[pd.DataFrame], weights: np.ndarray, targets: np.ndarray, @@ -292,7 +324,7 @@ def _assess_targets( """Assess the targets to ensure they do not violate basic requirements like compatibility, correct order of magnitude, etc. Args: - estimate_function (Callable[[Tensor], Tensor]): Function to estimate the targets from weights. + estimate_function (Callable[[torch.Tensor], torch.Tensor]): Function to estimate the targets from weights. estimate_matrix (Optional[pd.DataFrame]): DataFrame containing the estimate matrix. Defaults to None. weights (np.ndarray): Array of original weights. targets (np.ndarray): Array of target values. @@ -315,6 +347,11 @@ def _assess_targets( "Some targets are negative. This may not make sense for totals." ) + if estimate_matrix is None and self.excluded_targets is not None: + self.logger.warning( + "You are excluding targets but not passing an estimate matrix. Make sure the estimate function handles excluded targets correctly, otherwise you may face operand errors." + ) + # Estimate order of magnitude from column sums and warn if they are off by an order of magnitude from targets one_weights = weights * 0 + 1 estimates = ( @@ -328,6 +365,7 @@ def _assess_targets( .numpy() .flatten() ) + # Use a small epsilon to avoid division by zero eps = 1e-4 adjusted_estimates = np.where(estimates == 0, eps, estimates) @@ -444,7 +482,7 @@ def _get_linear_loss(metrics_matrix, target_vector, sparse=False): def summary( self, - ) -> str: + ) -> pd.DataFrame: """Generate a summary of the calibration process.""" if self.performance_df is None: return "No calibration has been performed yet, make sure to run .calibrate() before requesting a summary." @@ -468,3 +506,133 @@ def summary( ) / df["Official target"] df = df.reset_index(drop=True) return df + + def tune_l0_hyperparameters( + self, + n_trials: Optional[int] = 30, + objectives_balance: Optional[Dict[str, float]] = None, + epochs_per_trial: Optional[int] = None, + n_holdout_sets: Optional[int] = 3, + holdout_fraction: Optional[float] = 0.2, + aggregation: Optional[str] = "mean", + timeout: Optional[float] = None, + n_jobs: Optional[int] = 1, + study_name: Optional[str] = None, + storage: Optional[str] = None, + load_if_exists: Optional[bool] = False, + direction: Optional[str] = "minimize", + sampler: Optional["optuna.samplers.BaseSampler"] = None, + pruner: Optional["optuna.pruners.BasePruner"] = None, + ) -> Dict[str, Any]: + """ + Tune hyperparameters for L0 regularization using Optuna. + + This method optimizes l0_lambda, init_mean, and temperature to achieve: + 1. Low calibration loss + 2. High percentage of targets within 10% of their true values + 3. Sparse weights (fewer non-zero weights) + + Args: + n_trials: Number of optimization trials to run. + objectives_balance: Dictionary to balance the importance of loss, accuracy, and sparsity in the objective function. 
Default prioritizes being within 10% of targets. + epochs_per_trial: Number of epochs per trial. If None, uses self.epochs // 4. + n_holdout_sets: Number of different holdout sets to create and evaluate on + holdout_fraction: Fraction of targets in each holdout set + aggregation: How to combine scores across holdouts ("mean", "median", "worst") + timeout: Stop study after this many seconds. None means no timeout. + n_jobs: Number of parallel jobs. -1 means using all processors. + study_name: Name of the study for storage. + storage: Database URL for distributed optimization. + load_if_exists: Whether to load existing study. + direction: Optimization direction ('minimize' or 'maximize'). + sampler: Optuna sampler for hyperparameter suggestions. + pruner: Optuna pruner for early stopping of trials. + + Returns: + Dictionary containing the best hyperparameters found. + """ + return _tune_l0_hyperparameters( + calibration=self, + n_trials=n_trials, + objectives_balance=objectives_balance, + epochs_per_trial=epochs_per_trial, + n_holdout_sets=n_holdout_sets, + holdout_fraction=holdout_fraction, + aggregation=aggregation, + timeout=timeout, + n_jobs=n_jobs, + study_name=study_name, + storage=storage, + load_if_exists=load_if_exists, + direction=direction, + sampler=sampler, + pruner=pruner, + ) + + def _create_holdout_sets( + self, + n_holdout_sets: int, + holdout_fraction: float, + random_state: Optional[int] = None, + ) -> List[Dict[str, Any]]: + """Create multiple holdout sets for cross-validation. + + Args: + n_holdout_sets: Number of holdout sets to create + holdout_fraction: Fraction of targets in each holdout set + random_state: Base random seed for reproducibility + exclude_excluded: Whether to exclude already excluded targets from the holdout sets + + Returns: + List of dictionaries containing holdout names and indices + """ + n_targets = len(self.target_names) + n_holdout_targets = max(1, int(n_targets * holdout_fraction)) + + holdout_sets = [] + for i in range(n_holdout_sets): + # Each holdout set gets a different random selection + set_rng = np.random.default_rng((random_state or self.seed) + i) + holdout_indices = set_rng.choice( + n_targets, size=n_holdout_targets, replace=False + ) + holdout_names = [self.target_names[idx] for idx in holdout_indices] + holdout_sets.append( + {"names": holdout_names, "indices": holdout_indices} + ) + + return holdout_sets + + def evaluate_holdout_robustness( + self, + n_holdout_sets: Optional[int] = 5, + holdout_fraction: Optional[float] = 0.2, + save_results_to: Optional[str] = None, + ) -> Dict[str, Any]: + """ + Evaluate calibration robustness using holdout validation. + + This function assesses how well the calibration generalizes by: + 1. Repeatedly holding out random subsets of targets + 2. Calibrating on the remaining targets + 3. Evaluating performance on held-out targets + + Args: + n_holdout_sets (int): Number of different holdout sets to evaluate. + More sets provide better estimates but increase computation time. + holdout_fraction (float): Fraction of targets to hold out in each set. + save_results_to (str): Path to save detailed results as CSV. If None, no saving. 
+ + Returns: + Dict[str, Any]: Dictionary containing: + - overall_metrics: Summary statistics across all holdouts + - target_robustness: DataFrame showing each target's performance when held out + - recommendation: String with interpretation and recommendations + - detailed_results: (if requested) List of detailed results per holdout + """ + return _evaluate_holdout_robustness( + calibration=self, + n_holdout_sets=n_holdout_sets, + holdout_fraction=holdout_fraction, + save_results_to=save_results_to, + ) diff --git a/src/microcalibrate/evaluation.py b/src/microcalibrate/evaluation.py index e485194..138a45a 100644 --- a/src/microcalibrate/evaluation.py +++ b/src/microcalibrate/evaluation.py @@ -1,10 +1,13 @@ import logging -from typing import List, Optional, Union +from pathlib import Path +from typing import Any, Dict, List, Optional, Union import numpy as np import pandas as pd import torch +from microcalibrate.utils.metrics import loss, pct_close + logger = logging.getLogger(__name__) @@ -131,3 +134,507 @@ def evaluate_sparse_weights( logging.info(f"has rel_error: {rel_error[i]:.2f}\n") logging.info("---End of reweighting quick diagnostics------") return percent_within_10 + + +def _evaluate_single_holdout_robustness( + calibration, + holdout_set: Dict[str, Any], + holdout_idx: int, + n_holdout_sets: int, +) -> Optional[Dict[str, Any]]: + """Evaluate a single holdout set for robustness analysis. + + Args: + calibration: Calibration instance + holdout_set: Dictionary with holdout information + holdout_idx: Index of current holdout set + n_holdout_sets: Total number of holdout sets + + Returns: + Dictionary with evaluation results or None if failed + """ + try: + logger.info( + f"Evaluating holdout set {holdout_idx + 1}/{n_holdout_sets}" + ) + + # Run calibration on training targets + start_time = pd.Timestamp.now() + calibration.calibrate() + calibration_time = (pd.Timestamp.now() - start_time).total_seconds() + + # Get final weights (sparse if using L0, otherwise regular) + final_weights = ( + calibration.sparse_weights + if calibration.sparse_weights is not None + else calibration.weights + ) + + # Evaluate on all targets + weights_tensor = torch.tensor( + final_weights, dtype=torch.float32, device=calibration.device + ) + + # Get estimates for all targets using original estimate function/matrix + if calibration.original_estimate_matrix is not None: + original_matrix_tensor = torch.tensor( + calibration.original_estimate_matrix.values, + dtype=torch.float32, + device=calibration.device, + ) + all_estimates = ( + (weights_tensor @ original_matrix_tensor).cpu().numpy() + ) + else: + all_estimates = ( + calibration.original_estimate_function(weights_tensor) + .cpu() + .numpy() + ) + + # Calculate metrics for holdout vs training sets + holdout_indices = holdout_set["indices"] + train_indices = [ + i + for i in range(len(calibration.original_target_names)) + if i not in holdout_indices + ] + + holdout_estimates = all_estimates[holdout_indices] + holdout_targets = calibration.original_targets[holdout_indices] + holdout_names = holdout_set["names"] + + train_estimates = all_estimates[train_indices] + train_targets = calibration.original_targets[train_indices] + + # Calculate losses and accuracies + holdout_loss = loss( + torch.tensor( + holdout_estimates, + dtype=torch.float32, + device=calibration.device, + ), + torch.tensor( + holdout_targets, dtype=torch.float32, device=calibration.device + ), + None, + ).item() + + holdout_accuracy = pct_close( + torch.tensor( + holdout_estimates, + 
dtype=torch.float32, + device=calibration.device, + ), + torch.tensor( + holdout_targets, dtype=torch.float32, device=calibration.device + ), + ) + + train_loss = loss( + torch.tensor( + train_estimates, dtype=torch.float32, device=calibration.device + ), + torch.tensor( + train_targets, dtype=torch.float32, device=calibration.device + ), + None, + ).item() + + train_accuracy = pct_close( + torch.tensor( + train_estimates, dtype=torch.float32, device=calibration.device + ), + torch.tensor( + train_targets, dtype=torch.float32, device=calibration.device + ), + ) + + # Calculate per-target metrics for holdout targets + target_details = [] + for idx, name in enumerate(holdout_names): + rel_error = ( + holdout_estimates[idx] - holdout_targets[idx] + ) / holdout_targets[idx] + target_details.append( + { + "target_name": name, + "target_value": holdout_targets[idx], + "estimate": holdout_estimates[idx], + "relative_error": rel_error, + "within_10pct": abs(rel_error) <= 0.1, + } + ) + + generalization_gap = holdout_loss - train_loss + accuracy_gap = train_accuracy - holdout_accuracy + + result = { + "holdout_set_idx": holdout_idx, + "n_holdout_targets": len(holdout_indices), + "n_train_targets": len(train_indices), + "holdout_loss": holdout_loss, + "train_loss": train_loss, + "generalization_gap": generalization_gap, + "holdout_accuracy": holdout_accuracy, + "train_accuracy": train_accuracy, + "accuracy_gap": accuracy_gap, + "calibration_time_seconds": calibration_time, + "holdout_target_names": holdout_names, + "target_details": target_details, + "weights_sparsity": ( + np.mean(final_weights == 0) + if calibration.sparse_weights is not None + else 0 + ), + } + + return result + + except Exception as e: + logger.error(f"Error in holdout set {holdout_idx}: {str(e)}") + return None + + +def _save_holdout_results( + save_path: str, + overall_metrics: Dict[str, float], + target_robustness_df: pd.DataFrame, + detailed_results: List[Dict[str, Any]], +) -> None: + """Save detailed holdout results to CSV files. 
+ + Args: + save_path: Path to save results + overall_metrics: Overall metrics dictionary + target_robustness_df: Target robustness dataframe + detailed_results: List of detailed results + """ + save_path = Path(save_path) + save_path.parent.mkdir(parents=True, exist_ok=True) + + overall_df = pd.DataFrame([overall_metrics]) + overall_path = save_path.with_name(f"{save_path.stem}_overall.csv") + overall_df.to_csv(overall_path, index=False) + + robustness_path = save_path.with_name( + f"{save_path.stem}_target_robustness.csv" + ) + target_robustness_df.to_csv(robustness_path, index=False) + + detailed_data = [] + for result in detailed_results: + for target_detail in result["target_details"]: + detailed_data.append( + { + "holdout_set_idx": result["holdout_set_idx"], + "target_name": target_detail["target_name"], + "target_value": target_detail["target_value"], + "estimate": target_detail["estimate"], + "relative_error": target_detail["relative_error"], + "within_10pct": target_detail["within_10pct"], + "holdout_loss": result["holdout_loss"], + "train_loss": result["train_loss"], + "generalization_gap": result["generalization_gap"], + } + ) + + detailed_df = pd.DataFrame(detailed_data) + detailed_path = save_path.with_name(f"{save_path.stem}_detailed.csv") + detailed_df.to_csv(detailed_path, index=False) + + +def _generate_robustness_recommendation( + overall_metrics: Dict[str, float], + target_robustness_df: pd.DataFrame, + regularize_with_l0: bool, +) -> str: + """Generate interpretation and recommendations based on robustness evaluation. + + Args: + overall_metrics: Overall metrics dictionary + target_robustness_df: Target robustness dataframe + regularize_with_l0: Whether L0 regularization is enabled + + Returns: + Recommendation string + """ + mean_acc = overall_metrics["mean_holdout_accuracy"] + std_acc = overall_metrics["std_holdout_accuracy"] + worst_acc = overall_metrics["worst_holdout_accuracy"] + gen_gap = overall_metrics["mean_generalization_gap"] + problematic_targets = target_robustness_df[ + target_robustness_df["holdout_accuracy_rate"] < 0.5 + ]["target_name"].tolist() + + rec_parts = [] + + # Overall assessment + if mean_acc >= 0.9 and std_acc <= 0.05: + rec_parts.append( + "✅ EXCELLENT ROBUSTNESS: The calibration generalizes very well." + ) + elif mean_acc >= 0.8 and std_acc <= 0.1: + rec_parts.append( + "👍 GOOD ROBUSTNESS: The calibration shows good generalization." + ) + elif mean_acc >= 0.7: + rec_parts.append( + "⚠️ MODERATE ROBUSTNESS: The calibration has decent but improvable generalization." + ) + else: + rec_parts.append( + "❌ POOR ROBUSTNESS: The calibration shows weak generalization." + ) + + rec_parts.append( + f"\nOn average, {mean_acc:.1%} of held-out targets are within 10% of their true values." + ) + + # Stability assessment + if std_acc > 0.15: + rec_parts.append( + f"\n ⚠️ High variability (std={std_acc:.1%}) suggests instability across different target combinations." + ) + + # Worst-case analysis + if worst_acc < 0.5: + rec_parts.append( + f"\n ⚠️ Worst-case scenario: Only {worst_acc:.1%} accuracy in some holdout sets." 
+ ) + + # Problematic targets + if problematic_targets: + rec_parts.append( + f"\n\n📊 Targets with poor holdout performance (<50% accuracy):" + ) + for target in problematic_targets[:5]: + target_data = target_robustness_df[ + target_robustness_df["target_name"] == target + ].iloc[0] + rec_parts.append( + f"\n - {target}: {target_data['holdout_accuracy_rate']:.1%} accuracy" + ) + + rec_parts.append("\n\n💡 RECOMMENDATIONS:") + + if mean_acc < 0.8 or std_acc > 0.1: + if regularize_with_l0: + rec_parts.append( + "\n 1. Consider tuning L0 regularization parameters with tune_hyperparameters()" + ) + else: + rec_parts.append( + "\n 1. Consider enabling L0 regularization for better generalization" + ) + + rec_parts.append( + "\n 2. Increase the noise_level parameter to improve robustness" + ) + rec_parts.append( + "\n 3. Try increasing dropout_rate to reduce overfitting" + ) + + if problematic_targets: + rec_parts.append( + f"\n 4. Investigate why these targets are hard to predict: {', '.join(problematic_targets[:3])}" + ) + rec_parts.append( + "\n 5. Consider if these targets have sufficient support in the microdata" + ) + + if gen_gap > 0.01: + rec_parts.append( + f"\n 6. Generalization gap of {gen_gap:.4f} suggests some overfitting - consider regularization" + ) + + return "".join(rec_parts) + + +def evaluate_holdout_robustness( + calibration, + n_holdout_sets: Optional[int] = 5, + holdout_fraction: Optional[float] = 0.2, + save_results_to: Optional[str] = None, +) -> Dict[str, Any]: + """ + Evaluate calibration robustness using holdout validation. + + This function assesses how well the calibration generalizes by: + 1. Repeatedly holding out random subsets of targets + 2. Calibrating on the remaining targets + 3. Evaluating performance on held-out targets + + Args: + calibration: Calibration instance to evaluate + n_holdout_sets: Number of different holdout sets to evaluate. + More sets provide better estimates but increase computation time. + holdout_fraction: Fraction of targets to hold out in each set. + save_results_to: Path to save detailed results as CSV. If None, no saving. + + Returns: + Dict[str, Any]: Dictionary containing: + - overall_metrics: Summary statistics across all holdouts + - target_robustness: DataFrame showing each target's performance when held out + - recommendation: String with interpretation and recommendations + - detailed_results: (if requested) List of detailed results per holdout + """ + logger.info( + f"Starting holdout robustness evaluation with {n_holdout_sets} sets, " + f"holding out {holdout_fraction:.1%} of targets each time." + ) + + logger.warning( + "Data leakage warning: Targets often share overlapping information " + "(e.g., geographic breakdowns like 'snap in CA' and 'snap in US'). " + "Holdout validation may not provide complete isolation between training and validation sets. " + "The robustness metrics should be interpreted with this limitation in mind - " + "they may overestimate the model's true generalization performance." 
+ ) + + # Store original state + original_state = { + "weights": calibration.weights.copy(), + "excluded_targets": ( + calibration.excluded_targets.copy() + if calibration.excluded_targets + else None + ), + "targets": calibration.targets.copy(), + "target_names": ( + calibration.target_names.copy() + if calibration.target_names is not None + else None + ), + "sparse_weights": ( + calibration.sparse_weights.copy() + if calibration.sparse_weights is not None + else None + ), + } + + # Create holdout sets + holdout_sets = calibration._create_holdout_sets( + n_holdout_sets, holdout_fraction, calibration.seed + 1 + ) + + # Collect results + all_results = [] + target_performance = { + name: {"held_out_losses": [], "held_out_accuracies": []} + for name in calibration.original_target_names + } + + try: + for i in range(n_holdout_sets): + holdout_set = holdout_sets[i] + + # Reset to original state + calibration.weights = original_state["weights"].copy() + calibration.excluded_targets = holdout_set["names"] + calibration.exclude_targets() + + result = _evaluate_single_holdout_robustness( + calibration, holdout_set, i, n_holdout_sets + ) + + if result is not None: + all_results.append(result) + + # Update target performance tracking + for detail in result["target_details"]: + name = detail["target_name"] + target_performance[name]["held_out_losses"].append( + (detail["estimate"] - detail["target_value"]) ** 2 + ) + target_performance[name]["held_out_accuracies"].append( + detail["within_10pct"] + ) + finally: + # Restore original state + for key, value in original_state.items(): + if value is not None: + setattr( + calibration, + key, + value.copy() if hasattr(value, "copy") else value, + ) + if calibration.excluded_targets: + calibration.exclude_targets() + + if not all_results: + raise ValueError("No successful holdout evaluations completed") + + # Calculate overall metrics + holdout_losses = [r["holdout_loss"] for r in all_results] + holdout_accuracies = [r["holdout_accuracy"] for r in all_results] + train_losses = [r["train_loss"] for r in all_results] + train_accuracies = [r["train_accuracy"] for r in all_results] + generalization_gaps = [r["generalization_gap"] for r in all_results] + + overall_metrics = { + "mean_holdout_loss": np.mean(holdout_losses), + "std_holdout_loss": np.std(holdout_losses), + "mean_holdout_accuracy": np.mean(holdout_accuracies), + "std_holdout_accuracy": np.std(holdout_accuracies), + "worst_holdout_accuracy": np.min(holdout_accuracies), + "best_holdout_accuracy": np.max(holdout_accuracies), + "mean_train_loss": np.mean(train_losses), + "mean_train_accuracy": np.mean(train_accuracies), + "mean_generalization_gap": np.mean(generalization_gaps), + "std_generalization_gap": np.std(generalization_gaps), + "n_successful_evaluations": len(all_results), + "n_failed_evaluations": n_holdout_sets - len(all_results), + } + + target_robustness_data = [] + for target_name in calibration.original_target_names: + perf = target_performance[target_name] + if perf[ + "held_out_losses" + ]: # Only include if target was held out at least once + target_robustness_data.append( + { + "target_name": target_name, + "times_held_out": len(perf["held_out_losses"]), + "mean_holdout_loss": np.mean(perf["held_out_losses"]), + "std_holdout_loss": np.std(perf["held_out_losses"]), + "holdout_accuracy_rate": np.mean( + perf["held_out_accuracies"] + ), + } + ) + + target_robustness_df = pd.DataFrame(target_robustness_data) + target_robustness_df = target_robustness_df.sort_values( + 
"holdout_accuracy_rate", ascending=True + ) + + # Generate recommendations + recommendation = _generate_robustness_recommendation( + overall_metrics, target_robustness_df, calibration.regularize_with_l0 + ) + + # Save results if requested + if save_results_to: + _save_holdout_results( + save_results_to, overall_metrics, target_robustness_df, all_results + ) + + results = { + "overall_metrics": overall_metrics, + "target_robustness": target_robustness_df, + "recommendation": recommendation, + "detailed_results": all_results, + } + + logger.info( + f"\nHoldout evaluation completed:" + f"\n Mean holdout accuracy: {overall_metrics['mean_holdout_accuracy']:.2%} " + f"(±{overall_metrics['std_holdout_accuracy']:.2%})" + f"\n Worst-case accuracy: {overall_metrics['worst_holdout_accuracy']:.2%}" + f"\n Generalization gap: {overall_metrics['mean_generalization_gap']:.6f}" + f"\n Least robust targets: {', '.join(target_robustness_df.head(5)['target_name'].tolist())}" + ) + + return results diff --git a/src/microcalibrate/hyperparameter_tuning.py b/src/microcalibrate/hyperparameter_tuning.py new file mode 100644 index 0000000..e434b12 --- /dev/null +++ b/src/microcalibrate/hyperparameter_tuning.py @@ -0,0 +1,450 @@ +"""Hyperparameter tuning functionality for calibration.""" + +import logging +from typing import Any, Dict, List, Optional + +import numpy as np +import optuna +import pandas as pd +import torch + +from microcalibrate.utils.metrics import loss, pct_close + +logger = logging.getLogger(__name__) + + +def _evaluate_single_holdout( + calibration, + holdout_set: Dict[str, Any], + hyperparameters: Dict[str, float], + epochs_per_trial: int, + objectives_balance: Dict[str, float], +) -> Dict[str, Any]: + """Evaluate hyperparameters on a single holdout set. 
+ + Args: + calibration: Calibration instance + holdout_set: Dictionary with 'names' and 'indices' of holdout targets + hyperparameters: Dictionary with l0_lambda, init_mean, temperature + epochs_per_trial: Number of epochs to run + objectives_balance: Weights for different objectives + + Returns: + Dictionary with evaluation metrics and holdout target names + """ + # Store original parameters + original_params = { + "l0_lambda": calibration.l0_lambda, + "init_mean": calibration.init_mean, + "temperature": calibration.temperature, + "regularize_with_l0": calibration.regularize_with_l0, + "epochs": calibration.epochs, + } + + try: + # Update parameters for this evaluation + calibration.l0_lambda = hyperparameters["l0_lambda"] + calibration.init_mean = hyperparameters["init_mean"] + calibration.temperature = hyperparameters["temperature"] + calibration.regularize_with_l0 = True + calibration.epochs = epochs_per_trial + + # Set up calibration with this holdout set + calibration.excluded_targets = holdout_set["names"] + calibration.exclude_targets() + + # Run calibration + calibration.calibrate() + sparse_weights = calibration.sparse_weights + + # Get estimates for all targets + weights_tensor = torch.tensor( + sparse_weights, dtype=torch.float32, device=calibration.device + ) + + if calibration.original_estimate_matrix is not None: + original_matrix_tensor = torch.tensor( + calibration.original_estimate_matrix.values, + dtype=torch.float32, + device=calibration.device, + ) + all_estimates = ( + (weights_tensor @ original_matrix_tensor).cpu().numpy() + ) + else: + all_estimates = ( + calibration.original_estimate_function(weights_tensor) + .cpu() + .numpy() + ) + + # Split into train/validation + n_targets = len(calibration.original_target_names) + val_indices = holdout_set["indices"] + train_indices = [i for i in range(n_targets) if i not in val_indices] + + val_estimates = all_estimates[val_indices] + val_targets = calibration.original_targets[val_indices] + train_estimates = all_estimates[train_indices] + train_targets = calibration.original_targets[train_indices] + + # Calculate metrics + val_loss = loss( + torch.tensor( + val_estimates, dtype=torch.float32, device=calibration.device + ), + torch.tensor( + val_targets, dtype=torch.float32, device=calibration.device + ), + None, + ).item() + + val_accuracy = pct_close( + torch.tensor( + val_estimates, dtype=torch.float32, device=calibration.device + ), + torch.tensor( + val_targets, dtype=torch.float32, device=calibration.device + ), + ) + + train_loss = loss( + torch.tensor( + train_estimates, dtype=torch.float32, device=calibration.device + ), + torch.tensor( + train_targets, dtype=torch.float32, device=calibration.device + ), + None, + ).item() + + train_accuracy = pct_close( + torch.tensor( + train_estimates, dtype=torch.float32, device=calibration.device + ), + torch.tensor( + train_targets, dtype=torch.float32, device=calibration.device + ), + ) + + sparsity = np.mean(sparse_weights == 0) + + # Calculate objective + objective = ( + val_loss * objectives_balance["loss"] + + (1 - val_accuracy) * objectives_balance["accuracy"] + + (1 - sparsity) * objectives_balance["sparsity"] + ) + + return { + "objective": objective, + "val_loss": val_loss, + "val_accuracy": val_accuracy, + "train_loss": train_loss, + "train_accuracy": train_accuracy, + "sparsity": sparsity, + "n_nonzero_weights": int(np.sum(sparse_weights != 0)), + "holdout_targets": holdout_set["names"], + "hyperparameters": hyperparameters.copy(), + } + + finally: + # Restore 
original parameters + for key, value in original_params.items(): + setattr(calibration, key, value) + + +def _create_objective_function( + calibration, + holdout_sets: List[Dict[str, Any]], + epochs_per_trial: int, + objectives_balance: Dict[str, float], + aggregation: str, + all_evaluations: List, + original_state: Dict, +): + """Create the objective function for Optuna optimization. + + Args: + calibration: Calibration instance + holdout_sets: List of holdout sets + epochs_per_trial: Number of epochs per trial + objectives_balance: Weights for different objectives + aggregation: How to aggregate results across holdouts + all_evaluations: List to collect evaluation records + original_state: Original calibration state to restore + + Returns: + Objective function for Optuna + """ + + def objective(trial: optuna.Trial) -> float: + """Objective function for Optuna optimization.""" + try: + # Suggest hyperparameters + hyperparameters = { + "l0_lambda": trial.suggest_float( + "l0_lambda", 1e-6, 1e-4, log=True + ), + "init_mean": trial.suggest_float("init_mean", 0.5, 0.999), + "temperature": trial.suggest_float("temperature", 0.5, 2.0), + } + + # Evaluate on all holdout sets + holdout_results = [] + for holdout_idx, holdout_set in enumerate(holdout_sets): + result = _evaluate_single_holdout( + calibration=calibration, + holdout_set=holdout_set, + hyperparameters=hyperparameters, + epochs_per_trial=epochs_per_trial, + objectives_balance=objectives_balance, + ) + # Add trial and holdout identifiers for tracking + evaluation_record = result.copy() + evaluation_record["trial_number"] = trial.number + evaluation_record["holdout_set_idx"] = holdout_idx + all_evaluations.append(evaluation_record) + holdout_results.append(result) + + # Aggregate objectives + objectives = [r["objective"] for r in holdout_results] + + if aggregation == "mean": + final_objective = np.mean(objectives) + elif aggregation == "median": + final_objective = np.median(objectives) + elif aggregation == "worst": + final_objective = np.max(objectives) + else: + raise ValueError(f"Unknown aggregation method: {aggregation}") + + # Store detailed metrics + trial.set_user_attr( + "holdout_objectives", [r["objective"] for r in holdout_results] + ) + trial.set_user_attr( + "mean_val_loss", + np.mean([r["val_loss"] for r in holdout_results]), + ) + trial.set_user_attr( + "std_val_loss", + np.std([r["val_loss"] for r in holdout_results]), + ) + trial.set_user_attr( + "mean_val_accuracy", + np.mean([r["val_accuracy"] for r in holdout_results]), + ) + trial.set_user_attr( + "std_val_accuracy", + np.std([r["val_accuracy"] for r in holdout_results]), + ) + trial.set_user_attr( + "mean_train_loss", + np.mean([r["train_loss"] for r in holdout_results]), + ) + trial.set_user_attr( + "mean_train_accuracy", + np.mean([r["train_accuracy"] for r in holdout_results]), + ) + + # Use the last holdout's sparsity metrics + last_result = holdout_results[-1] + trial.set_user_attr("sparsity", last_result["sparsity"]) + trial.set_user_attr( + "n_nonzero_weights", last_result.get("n_nonzero_weights", 0) + ) + + # Log progress + if trial.number % 5 == 0: + objectives = [r["objective"] for r in holdout_results] + val_accuracies = [r["val_accuracy"] for r in holdout_results] + logger.info( + f"Trial {trial.number}:\n" + f" Objectives by holdout: {[f'{obj:.4f}' for obj in objectives]}\n" + f" {aggregation.capitalize()} objective: {final_objective:.4f}\n" + f" Mean val accuracy: {np.mean(val_accuracies):.2%} (±{np.std(val_accuracies):.2%})\n" + f" Sparsity: 
{last_result['sparsity']:.2%}" + ) + + return final_objective + + except Exception as e: + logger.warning(f"Trial {trial.number} failed: {str(e)}") + return 1e10 + + finally: + # Restore original state + calibration.excluded_targets = original_state["excluded_targets"] + calibration.targets = original_state["targets"] + calibration.target_names = original_state["target_names"] + calibration.exclude_targets() + + return objective + + +def tune_l0_hyperparameters( + calibration, + n_trials: Optional[int] = 30, + objectives_balance: Optional[Dict[str, float]] = None, + epochs_per_trial: Optional[int] = None, + n_holdout_sets: Optional[int] = 3, + holdout_fraction: Optional[float] = 0.2, + aggregation: Optional[str] = "mean", + timeout: Optional[float] = None, + n_jobs: Optional[int] = 1, + study_name: Optional[str] = None, + storage: Optional[str] = None, + load_if_exists: Optional[bool] = False, + direction: Optional[str] = "minimize", + sampler: Optional["optuna.samplers.BaseSampler"] = None, + pruner: Optional["optuna.pruners.BasePruner"] = None, +) -> Dict[str, Any]: + """ + Tune hyperparameters for L0 regularization using Optuna. + + This method optimizes l0_lambda, init_mean, and temperature to achieve: + 1. Low calibration loss + 2. High percentage of targets within 10% of their true values + 3. Sparse weights (fewer non-zero weights) + + Args: + calibration: Calibration instance to tune + n_trials: Number of optimization trials to run. + objectives_balance: Dictionary to balance the importance of loss, accuracy, and sparsity + in the objective function. Default prioritizes being within 10% of targets. + epochs_per_trial: Number of epochs per trial. If None, uses calibration.epochs // 4. + n_holdout_sets: Number of different holdout sets to create and evaluate on + holdout_fraction: Fraction of targets in each holdout set + aggregation: How to combine scores across holdouts ("mean", "median", "worst") + timeout: Stop study after this many seconds. None means no timeout. + n_jobs: Number of parallel jobs. -1 means using all processors. + study_name: Name of the study for storage. + storage: Database URL for distributed optimization. + load_if_exists: Whether to load existing study. + direction: Optimization direction ('minimize' or 'maximize'). + sampler: Optuna sampler for hyperparameter suggestions. + pruner: Optuna pruner for early stopping of trials. + + Returns: + Dictionary containing the best hyperparameters found. + """ + # Suppress Optuna's logs during optimization + optuna.logging.set_verbosity(optuna.logging.WARNING) + + if objectives_balance is None: + objectives_balance = {"loss": 1.0, "accuracy": 100.0, "sparsity": 10.0} + + if epochs_per_trial is None: + epochs_per_trial = max(calibration.epochs // 4, 100) + + holdout_sets = calibration._create_holdout_sets( + n_holdout_sets, holdout_fraction, calibration.seed + ) + + logger.info( + f"Multi-holdout hyperparameter tuning:\n" + f" - {n_holdout_sets} holdout sets\n" + f" - {len(holdout_sets[0]['indices'])} targets per holdout ({holdout_fraction:.1%})\n" + f" - Aggregation: {aggregation}\n" + ) + + logger.warning( + "Data leakage warning: Targets often share overlapping information " + "(e.g., geographic breakdowns like 'snap in CA' and 'snap in US'). " + "Holdout validation may not provide complete isolation between training and validation sets. " + "The robustness metrics should be interpreted with this limitation in mind - " + "they may overestimate the model's true generalization performance." 
+ ) + + # Store original state + original_state = { + "excluded_targets": calibration.excluded_targets, + "targets": calibration.targets.copy(), + "target_names": ( + calibration.target_names.copy() + if calibration.target_names is not None + else None + ), + } + + # Initialize list to collect all holdout evaluations + all_evaluations = [] + + # Create objective function + objective = _create_objective_function( + calibration=calibration, + holdout_sets=holdout_sets, + epochs_per_trial=epochs_per_trial, + objectives_balance=objectives_balance, + aggregation=aggregation, + all_evaluations=all_evaluations, + original_state=original_state, + ) + + # Create or load study + if sampler is None: + sampler = optuna.samplers.TPESampler(seed=calibration.seed) + + study = optuna.create_study( + study_name=study_name, + storage=storage, + load_if_exists=load_if_exists, + direction=direction, + sampler=sampler, + pruner=pruner, + ) + + # Run optimization + study.optimize( + objective, + n_trials=n_trials, + timeout=timeout, + n_jobs=n_jobs, + show_progress_bar=True, + ) + + # Get best parameters + best_params = study.best_params + best_trial = study.best_trial + best_params["mean_val_loss"] = best_trial.user_attrs.get("mean_val_loss") + best_params["std_val_loss"] = best_trial.user_attrs.get("std_val_loss") + best_params["mean_val_accuracy"] = best_trial.user_attrs.get( + "mean_val_accuracy" + ) + best_params["std_val_accuracy"] = best_trial.user_attrs.get( + "std_val_accuracy" + ) + best_params["holdout_objectives"] = best_trial.user_attrs.get( + "holdout_objectives" + ) + best_params["sparsity"] = best_trial.user_attrs.get("sparsity") + best_params["n_holdout_sets"] = n_holdout_sets + best_params["aggregation"] = aggregation + + # Create evaluation tracking dataframe + evaluation_df = pd.DataFrame(all_evaluations) + + # Convert holdout_targets list to string for easier viewing + if "holdout_targets" in evaluation_df.columns: + evaluation_df["holdout_targets"] = evaluation_df[ + "holdout_targets" + ].apply(lambda x: ", ".join(x) if isinstance(x, list) else str(x)) + + best_params["evaluation_history"] = evaluation_df + + logger.info( + f"\nMulti-holdout tuning completed!" + f"\nBest parameters:" + f"\n - l0_lambda: {best_params['l0_lambda']:.2e}" + f"\n - init_mean: {best_params['init_mean']:.4f}" + f"\n - temperature: {best_params['temperature']:.4f}" + f"\nPerformance across {n_holdout_sets} holdouts:" + f"\n - Mean val loss: {best_params['mean_val_loss']:.6f} (±{best_params['std_val_loss']:.6f})" + f"\n - Mean val accuracy: {best_params['mean_val_accuracy']:.2%} (±{best_params['std_val_accuracy']:.2%})" + f"\n - Individual objectives: {[f'{obj:.4f}' for obj in best_params['holdout_objectives']]}" + f"\n - Sparsity: {best_params['sparsity']:.2%}" + f"\n\nEvaluation history saved with {len(evaluation_df)} records across {n_trials} trials." + ) + + return best_params diff --git a/src/microcalibrate/utils/metrics.py b/src/microcalibrate/utils/metrics.py index da55edf..37237ad 100644 --- a/src/microcalibrate/utils/metrics.py +++ b/src/microcalibrate/utils/metrics.py @@ -30,18 +30,18 @@ def loss( def pct_close( estimate: torch.Tensor, - targets_array: torch.Tensor, + targets: torch.Tensor, t: Optional[float] = 0.1, ) -> float: """Calculate the percentage of estimates close to targets. Args: estimate (torch.Tensor): Current estimates in log space. - targets_array (torch.Tensor): Array of target values. + targets (torch.Tensor): Array of target values. t (float): Optional threshold for closeness. 
Returns: float: Percentage of estimates within the threshold. """ - abs_error = torch.abs((estimate - targets_array) / (1 + targets_array)) + abs_error = torch.abs((estimate - targets) / (1 + targets)) return ((abs_error < t).sum() / abs_error.numel()).item() diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py index 1a7b371..d6de1d3 100644 --- a/tests/test_evaluation.py +++ b/tests/test_evaluation.py @@ -104,3 +104,243 @@ def test_all_within_tolerance(): np.testing.assert_array_almost_equal( result_df["distances"], [0.1, 0.2, 0.0] ) + + +def test_evaluate_holdout_robustness(): + """Test the holdout robustness evaluation functionality.""" + + # Create a more complex mock dataset with multiple features + random_generator = np.random.default_rng(42) + n_samples = 500 + + data = pd.DataFrame( + { + "age": random_generator.integers(18, 80, size=n_samples), + "income": random_generator.lognormal(10.5, 0.7, size=n_samples), + "region": random_generator.choice( + ["North", "South", "East", "West"], size=n_samples + ), + "employed": random_generator.binomial(1, 0.7, size=n_samples), + } + ) + + weights = random_generator.uniform(0.5, 1.5, size=n_samples) + weights = weights / weights.sum() * n_samples + + estimate_matrix = pd.DataFrame( + { + "total_population": np.ones(n_samples), + "employed_count": data["employed"].astype(float), + "income_north": ( + (data["region"] == "North") * data["income"] + ).astype(float), + "income_south": ( + (data["region"] == "South") * data["income"] + ).astype(float), + "income_east": ( + (data["region"] == "East") * data["income"] + ).astype(float), + "income_west": ( + (data["region"] == "West") * data["income"] + ).astype(float), + "young_employed": ( + (data["age"] < 30) & (data["employed"] == 1) + ).astype(float), + "senior_count": (data["age"] >= 65).astype(float), + } + ) + + targets = np.array( + [ + n_samples * 1.05, + (estimate_matrix["employed_count"] * weights).sum() * 0.95, + (estimate_matrix["income_north"] * weights).sum() * 1.1, + (estimate_matrix["income_south"] * weights).sum() * 0.9, + (estimate_matrix["income_east"] * weights).sum() * 1.05, + (estimate_matrix["income_west"] * weights).sum() * 0.98, + (estimate_matrix["young_employed"] * weights).sum() * 1.15, + (estimate_matrix["senior_count"] * weights).sum() * 0.92, + ] + ) + + calibrator = Calibration( + estimate_matrix=estimate_matrix, + weights=weights, + targets=targets, + noise_level=0.1, + epochs=100, + learning_rate=0.01, + dropout_rate=0.05, + seed=42, + ) + calibrator.calibrate() + + # Test basic robustness evaluation + results = calibrator.evaluate_holdout_robustness( + n_holdout_sets=3, + holdout_fraction=0.25, + save_results_to=None, # pass a str path if you want to save and explore results' dataframes + ) + + # Check structure of results + assert "overall_metrics" in results + assert "target_robustness" in results + assert "recommendation" in results + assert "detailed_results" in results + + # Check overall metrics + metrics = results["overall_metrics"] + assert "mean_holdout_loss" in metrics + assert "std_holdout_loss" in metrics + assert "mean_holdout_accuracy" in metrics + assert "std_holdout_accuracy" in metrics + assert "worst_holdout_accuracy" in metrics + assert "best_holdout_accuracy" in metrics + assert "mean_generalization_gap" in metrics + assert metrics["n_successful_evaluations"] == 3 + assert metrics["n_failed_evaluations"] == 0 + + # Check that accuracy is between 0 and 1 + assert 0 <= metrics["mean_holdout_accuracy"] <= 1 + assert 0 <= 
metrics["worst_holdout_accuracy"] <= 1 + assert 0 <= metrics["best_holdout_accuracy"] <= 1 + + # Check target robustness DataFrame + robustness_df = results["target_robustness"] + assert isinstance(robustness_df, pd.DataFrame) + assert len(robustness_df) > 0 # At least some targets should be evaluated + assert "target_name" in robustness_df.columns + assert "times_held_out" in robustness_df.columns + assert "holdout_accuracy_rate" in robustness_df.columns + assert "mean_holdout_loss" in robustness_df.columns + assert robustness_df["holdout_accuracy_rate"].is_monotonic_increasing + assert isinstance(results["recommendation"], str) + assert len(results["recommendation"]) > 0 + assert any( + word in results["recommendation"] + for word in ["ROBUSTNESS", "RECOMMENDATIONS"] + ) + + # Check detailed results + assert len(results["detailed_results"]) == 3 + for detail in results["detailed_results"]: + assert "holdout_loss" in detail + assert "train_loss" in detail + assert "holdout_accuracy" in detail + assert "train_accuracy" in detail + assert "generalization_gap" in detail + assert "target_details" in detail + assert len(detail["target_details"]) == 2 # 25% of 8 targets + + # Test error handling with invalid parameters + with pytest.raises(ValueError): + calibrator.evaluate_holdout_robustness( + n_holdout_sets=0, # Invalid + ) + with pytest.raises(ValueError): + calibrator.evaluate_holdout_robustness( + holdout_fraction=1.5, # Invalid + ) + + +def test_evaluate_holdout_robustness_with_l0_regularization(): + """Test robustness evaluation with L0 regularization enabled.""" + + # Create simple dataset + random_generator = np.random.default_rng(123) + n_samples = 200 + + estimate_matrix = pd.DataFrame( + { + "feature_1": random_generator.uniform(0.5, 1.5, n_samples), + "feature_2": random_generator.uniform(0.5, 1.5, n_samples), + "feature_3": random_generator.uniform(0.5, 1.5, n_samples), + "feature_redundant": random_generator.uniform( + 0, 0.1, n_samples + ), # Low signal + } + ) + + weights = np.ones(n_samples) + col_sums = estimate_matrix.sum() + targets = np.array( + [ + col_sums["feature_1"] * 0.95, + col_sums["feature_2"] * 1.05, + col_sums["feature_3"] * 1.0, + col_sums["feature_redundant"] * 1.1, + ] + ) + + # Initialize with L0 regularization - aggressive parameters for sparsity + calibrator = Calibration( + estimate_matrix=estimate_matrix, + weights=weights, + targets=targets, + regularize_with_l0=True, + l0_lambda=1e-4, + init_mean=0.5, + temperature=0.3, + epochs=100, + seed=123, + ) + + calibrator.calibrate() + + results = calibrator.evaluate_holdout_robustness( + n_holdout_sets=3, + holdout_fraction=0.25, + ) + assert all( + "weights_sparsity" in detail for detail in results["detailed_results"] + ) + sparsity_values = [ + detail["weights_sparsity"] for detail in results["detailed_results"] + ] + assert max(sparsity_values) >= 0 or calibrator.sparse_weights is not None + assert results["overall_metrics"]["mean_holdout_accuracy"] >= 0 + + +def test_evaluate_holdout_robustness_recommendation_logic(): + """Test the recommendation generation logic.""" + + # Create a calibrator with known poor performance + random_generator = np.random.default_rng(789) + n_samples = 50 + base_feature = random_generator.normal(0, 1, n_samples) + estimate_matrix = pd.DataFrame( + { + "feature_1": base_feature + + random_generator.normal(0, 0.1, n_samples), + "feature_2": base_feature + + random_generator.normal(0, 0.1, n_samples), + "feature_3": base_feature + + random_generator.normal(0, 0.1, n_samples), + 
} + ) + + weights = np.ones(n_samples) + targets = np.array([100, 200, 300]) + + calibrator = Calibration( + estimate_matrix=estimate_matrix, + weights=weights, + targets=targets, + epochs=20, + noise_level=0.01, + dropout_rate=0, + ) + + calibrator.calibrate() + results = calibrator.evaluate_holdout_robustness(n_holdout_sets=3) + + recommendation = results["recommendation"] + if results["overall_metrics"]["mean_holdout_accuracy"] < 0.7: + assert any(marker in recommendation for marker in ["⚠️", "❌"]) + if not calibrator.regularize_with_l0: + assert "L0 regularization" in recommendation + problematic = results["target_robustness"][ + results["target_robustness"]["holdout_accuracy_rate"] < 0.5 + ] + if len(problematic) > 0: + assert "Targets with poor holdout performance" in recommendation diff --git a/tests/test_regularization.py b/tests/test_regularization.py index d074ea8..584192a 100644 --- a/tests/test_regularization.py +++ b/tests/test_regularization.py @@ -7,10 +7,12 @@ import logging import numpy as np import pandas as pd +import pytest -def test_calibration_with_l0_regularization() -> None: - # Create a sample dataset for testing +@pytest.fixture(scope="session") +def test_data(): + """Create sample dataset and targets for L0 hyperparameter tuning tests.""" random_generator = np.random.default_rng(0) data = pd.DataFrame( { @@ -21,7 +23,6 @@ def test_calibration_with_l0_regularization() -> None: weights = np.ones(len(data)) - # Calculate target values: targets_matrix = pd.DataFrame( { "income_aged_20_30": ( @@ -56,6 +57,19 @@ def test_calibration_with_l0_regularization() -> None: ] ) + return { + "targets_matrix": targets_matrix, + "weights": weights, + "targets": targets, + } + + +def test_calibration_with_l0_regularization(test_data) -> None: + "Test calibration with L0 regularization." 
+ targets_matrix = test_data["targets_matrix"] + weights = test_data["weights"] + targets = test_data["targets"] + calibrator = Calibration( estimate_matrix=targets_matrix, weights=weights, @@ -106,3 +120,224 @@ def test_calibration_with_l0_regularization() -> None: assert ( percentage_below_threshold > 10 ), f"Only {percentage_below_threshold:.1f}% of sparse weights are below 0.5 (expected > 10%)" + + +def test_l0_hyperparameter_tuning_with_holdouts(test_data) -> None: + """Test L0 hyperparameter tuning with holdout validation.""" + targets_matrix = test_data["targets_matrix"] + weights = test_data["weights"] + targets = test_data["targets"] + + # Create calibrator instance + calibrator = Calibration( + estimate_matrix=targets_matrix, + weights=weights, + targets=targets, + noise_level=0.05, + epochs=200, + learning_rate=0.01, + dropout_rate=0, + regularize_with_l0=False, + ) + + # Test hyperparameter tuning + best_params = calibrator.tune_l0_hyperparameters( + n_trials=20, # Fewer trials for testing + epochs_per_trial=50, # Shorter epochs for quick testing + objectives_balance={ + "loss": 1.0, + "accuracy": 100.0, # Prioritize hitting targets + "sparsity": 30.0, + }, + n_jobs=1, + ) + + # Verify that best_params contains expected keys + assert "l0_lambda" in best_params, "Missing l0_lambda in best parameters" + assert "init_mean" in best_params, "Missing init_mean in best parameters" + assert ( + "temperature" in best_params + ), "Missing temperature in best parameters" + assert ( + "mean_val_loss" in best_params + ), "Missing mean_val_loss in best parameters" + assert ( + "mean_val_accuracy" in best_params + ), "Missing mean_val_accuracy in best parameters" + assert "sparsity" in best_params, "Missing sparsity in best parameters" + assert ( + "holdout_objectives" in best_params + ), "Missing holdout_objectives in best parameters" + + # Verify parameter ranges + assert ( + 1e-6 <= best_params["l0_lambda"] <= 1e-4 + ), f"l0_lambda {best_params['l0_lambda']} out of range" + assert ( + 0.5 <= best_params["init_mean"] <= 0.999 + ), f"init_mean {best_params['init_mean']} out of range" + assert ( + 0.5 <= best_params["temperature"] <= 2.0 + ), f"temperature {best_params['temperature']} out of range" + + # Verify metrics are reasonable + assert ( + 0 <= best_params["mean_val_accuracy"] <= 1 + ), "mean_val_accuracy should be between 0 and 1" + assert ( + 0 <= best_params["sparsity"] <= 1 + ), "sparsity should be between 0 and 1" + assert ( + best_params["mean_val_loss"] >= 0 + ), "mean_val_loss should be non-negative" + + best_params["evaluation_history"].to_csv( + "tests/l0_hyperparameter_tuning_history_with_holdouts.csv", index=False + ) + + # Now run calibration with the best parameters + calibrator.l0_lambda = best_params["l0_lambda"] + calibrator.init_mean = best_params["init_mean"] + calibrator.temperature = best_params["temperature"] + calibrator.regularize_with_l0 = True + + # Run the full calibration + performance_df = calibrator.calibrate() + sparse_weights = calibrator.sparse_weights + + assert ( + sparse_weights is not None + ), "Sparse weights should be generated with L0 regularization" + + # Evaluate the final calibration + percentage_within_10 = evaluate_sparse_weights( + optimised_weights=sparse_weights, + estimate_matrix=targets_matrix, + targets_array=targets, + ) + + # The tuned parameters should give reasonable results + assert ( + percentage_within_10 > 50 + ), f"Only {percentage_within_10:.1f}% of targets within 10% (expected > 50%)" + + # Check that we achieved some 
sparsity + actual_sparsity = np.mean(sparse_weights == 0) + assert ( + actual_sparsity > 0.1 + ), f"Sparsity {actual_sparsity:.1%} is too low (expected > 10%)" + + +def test_l0_hyperparameter_tuning_without_holdouts(test_data) -> None: + """Test L0 hyperparameter tuning without holdout validation (simpler case).""" + targets_matrix = test_data["targets_matrix"] + weights = test_data["weights"] + targets = test_data["targets"] + + # Create calibrator instance + calibrator = Calibration( + estimate_matrix=targets_matrix, + weights=weights, + targets=targets, + noise_level=0.05, + epochs=200, + learning_rate=0.01, + dropout_rate=0, + regularize_with_l0=False, + ) + + # Test hyperparameter tuning WITHOUT holdouts + best_params = calibrator.tune_l0_hyperparameters( + n_trials=10, + epochs_per_trial=30, + n_holdout_sets=1, # Single holdout set + holdout_fraction=0, # No holdouts - use all data for both training and validation + objectives_balance={ + "loss": 1.0, + "accuracy": 100.0, + "sparsity": 30.0, + }, + n_jobs=1, + ) + + # Verify that best_params contains expected keys + assert "l0_lambda" in best_params, "Missing l0_lambda in best parameters" + assert "init_mean" in best_params, "Missing init_mean in best parameters" + assert ( + "temperature" in best_params + ), "Missing temperature in best parameters" + assert ( + "mean_val_loss" in best_params + ), "Missing mean_val_loss in best parameters" + assert ( + "mean_val_accuracy" in best_params + ), "Missing mean_val_accuracy in best parameters" + assert "sparsity" in best_params, "Missing sparsity in best parameters" + + # Verify parameter ranges + assert ( + 1e-6 <= best_params["l0_lambda"] <= 1e-4 + ), f"l0_lambda {best_params['l0_lambda']} out of range" + assert ( + 0.5 <= best_params["init_mean"] <= 0.999 + ), f"init_mean {best_params['init_mean']} out of range" + assert ( + 0.5 <= best_params["temperature"] <= 2.0 + ), f"temperature {best_params['temperature']} out of range" + + # Verify metrics are reasonable + assert ( + 0 <= best_params["mean_val_accuracy"] <= 1 + ), "mean_val_accuracy should be between 0 and 1" + assert ( + 0 <= best_params["sparsity"] <= 1 + ), "sparsity should be between 0 and 1" + assert ( + best_params["mean_val_loss"] >= 0 + ), "mean_val_loss should be non-negative" + + # When there are no holdouts, n_holdout_sets should be 1 and aggregation should work + assert best_params["n_holdout_sets"] == 1, "Should have 1 holdout set" + assert ( + "holdout_objectives" in best_params + ), "Should still have holdout_objectives" + assert ( + len(best_params["holdout_objectives"]) == 1 + ), "Should have exactly 1 objective" + + best_params["evaluation_history"].to_csv( + "tests/l0_hyperparameter_tuning_history_without_holdouts.csv", + index=False, + ) + + # Run calibration with the best parameters + calibrator.l0_lambda = best_params["l0_lambda"] + calibrator.init_mean = best_params["init_mean"] + calibrator.temperature = best_params["temperature"] + calibrator.regularize_with_l0 = True + + # Run the full calibration + performance_df = calibrator.calibrate() + sparse_weights = calibrator.sparse_weights + + assert ( + sparse_weights is not None + ), "Sparse weights should be generated with L0 regularization" + + # Evaluate the final calibration + percentage_within_10 = evaluate_sparse_weights( + optimised_weights=sparse_weights, + estimate_matrix=targets_matrix, + targets_array=targets, + ) + + # The tuned parameters should give reasonable results + assert ( + percentage_within_10 > 50 + ), f"Only {percentage_within_10:.1f}% 
of targets within 10% (expected > 50%)" + + # Check that we achieved some sparsity + actual_sparsity = np.mean(sparse_weights == 0) + assert ( + actual_sparsity > 0.05 + ), f"Sparsity {actual_sparsity:.1%} is too low (expected > 5%)"
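
For reference, a minimal end-to-end usage sketch of the two entry points this patch adds (Calibration.tune_l0_hyperparameters and Calibration.evaluate_holdout_robustness), mirroring the pattern used in the new tests. The toy data, column names, trial counts, and the 5% target offset below are illustrative assumptions, not part of the patch:

import numpy as np
import pandas as pd

from microcalibrate import Calibration

rng = np.random.default_rng(0)
n = 1_000

# Toy estimate matrix: one column per target, one row per microdata record.
estimate_matrix = pd.DataFrame(
    {
        "population": np.ones(n),
        "employed": rng.binomial(1, 0.7, n).astype(float),
        "total_income": rng.lognormal(10.0, 0.5, n),
    }
)
weights = np.ones(n)
# Targets set 5% above the unweighted column totals so calibration has work to do.
targets = (estimate_matrix.mul(weights, axis=0).sum() * 1.05).to_numpy()

cal = Calibration(
    estimate_matrix=estimate_matrix,
    weights=weights,
    targets=targets,
    epochs=200,
    seed=0,
)

# Search l0_lambda / init_mean / temperature against several random target holdouts.
best = cal.tune_l0_hyperparameters(
    n_trials=10,
    n_holdout_sets=3,
    holdout_fraction=0.2,
    objectives_balance={"loss": 1.0, "accuracy": 100.0, "sparsity": 10.0},
)

# Re-run the full calibration with the tuned L0 settings.
cal.l0_lambda = best["l0_lambda"]
cal.init_mean = best["init_mean"]
cal.temperature = best["temperature"]
cal.regularize_with_l0 = True
cal.calibrate()

# Check how well the calibrated weights generalize to held-out targets.
report = cal.evaluate_holdout_robustness(n_holdout_sets=3, holdout_fraction=0.2)
print(report["overall_metrics"]["mean_holdout_accuracy"])
print(report["recommendation"])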