2 changes: 1 addition & 1 deletion .github/workflows/main.yml
@@ -9,7 +9,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.11"]
python-version: ["3.13"]

steps:
- name: Checkout repo
6 changes: 3 additions & 3 deletions .github/workflows/pr.yaml
@@ -11,7 +11,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.11"
python-version: "3.13"
- name: Install uv
uses: astral-sh/setup-uv@v5
- name: Install relevant dependencies
@@ -24,7 +24,7 @@ jobs:
strategy:
matrix:
os: [ ubuntu-latest ]
python-version: ["3.11"]
python-version: ["3.13"]
fail-fast: false
runs-on: ${{ matrix.os }}
steps:
@@ -58,7 +58,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.11"
python-version: "3.13"
- name: Install dependencies
run: |
uv pip install -e ".[dev,docs]" --system
2 changes: 1 addition & 1 deletion .github/workflows/versioning.yaml
@@ -25,7 +25,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v5
with:
-python-version: 3.11
+python-version: 3.13
- name: Build changelog
run: pip install yaml-changelog && make changelog
- name: Preview changelog update
4 changes: 4 additions & 0 deletions changelog_entry.yaml
@@ -0,0 +1,4 @@
+- bump: patch
+  changes:
+    fixed:
+      - Logic to add calculated variables (not input variables) to SingleYearDataset when loading from a microsim.
22 changes: 12 additions & 10 deletions docs/calibration.ipynb
@@ -59,7 +59,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"id": "5a58bd2b",
"metadata": {},
"outputs": [
@@ -92,12 +92,12 @@
" db_uri=db_uri, update_database=True\n",
")\n",
"\n",
"# Uprate targets for consistency across definition year (disabled until IRS SOI variables are renamed to avoid errors)\n",
"# uprating_results = uprate_calibration_targets(\n",
"# system=system, db_uri=db_uri, \n",
"# from_period=2022, to_period=2023, \n",
"# update_database=True\n",
"# )"
"# Uprate targets for consistency across definition year\n",
"uprating_results = uprate_calibration_targets(\n",
" system=system, db_uri=db_uri, \n",
" from_period=2022, to_period=2023, \n",
" update_database=True\n",
")"
]
},
{
@@ -118,7 +118,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"id": "c75954d1",
"metadata": {},
"outputs": [
@@ -197,7 +197,8 @@
" dataset_subsample_size=10000, # Small sample for faster execution\n",
" use_dataset_weights=False, # Start with equal weights\n",
" regularize_with_l0=True, # Enable sparsity\n",
" noise_level=10.0\n",
" noise_level=10.0,\n",
" raise_error=False,\n",
")\n",
"\n",
"# Examine the results\n",
@@ -225,7 +226,8 @@
" stack_datasets=False, # Don't stack since we're using pre-stacked data\n",
" noise_level=0.0, # Minimal noise to preserve state calibration\n",
" use_dataset_weights=True, # Start from state-calibrated weights\n",
" regularize_with_l0=False # No sparsity at national level\n",
" regularize_with_l0=False, # No sparsity at national level\n",
" raise_error=False,\n",
")\n",
"\n",
"# Compare results\n",
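Reviewer note on the re-enabled uprating step above: conceptually, a target defined for 2022 is re-expressed in 2023 terms by applying a growth factor. A minimal sketch of that idea, assuming an illustrative flat factor (the real `uprate_calibration_targets` derives its factors from the calibration database rather than taking them as arguments):

```python
# Conceptual sketch only: re-express a 2022 calibration target in 2023 terms.
def uprate_target(value_2022: float, growth_factor: float = 1.04) -> float:
    # growth_factor is a hypothetical year-over-year index ratio; the real
    # pipeline looks up per-target factors instead of using a constant.
    return value_2022 * growth_factor


print(uprate_target(1_000_000.0))  # 1040000.0
```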
17 changes: 15 additions & 2 deletions docs/dataset.ipynb
@@ -169,9 +169,17 @@
"#### Method 3: From a PolicyEngine MicroSimulation"
]
},
+{
+"cell_type": "markdown",
+"id": "f7545881",
+"metadata": {},
+"source": [
+"Note that, by default, the `from_simulation()` method only loads input variables, which come from the underlying dataset. This can cause problems later when trying to calculate specific variables from the loaded SingleYearDataset without running the whole microsimulation. Setting the `include_all_variables` parameter to `True` calculates all variables in the microsimulation object while loading the SingleYearDataset, which avoids such problems but is computationally heavy. Alternatively, the `additional_variables` parameter accepts a dictionary of specific \"calculated\" variables to load, avoiding the computation of variables that will not be needed."
+]
+},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"id": "4f1a90f6",
"metadata": {},
"outputs": [
@@ -192,7 +200,12 @@
"\n",
"sim = Microsimulation(dataset=dataset)\n",
"\n",
"single_year_dataset = SingleYearDataset.from_simulation(sim, time_period=start_year)\n",
"single_year_dataset = SingleYearDataset.from_simulation(\n",
" sim, \n",
" time_period=start_year,\n",
" include_all_variables=False,\n",
" additional_variables={\"person\": [\"employment_income\"]},\n",
")\n",
"single_year_dataset.time_period = start_year\n",
"\n",
"print(f\"Dataset created from PolicyEngine US microdata stored in {dataset}\")\n",
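The three loading modes described in the new markdown cell, side by side. This is a sketch against the signature shown in this diff, reusing the `sim` and `start_year` defined in the notebook cell above:

```python
# Default: input variables only (fast; calculated variables are absent).
ds_inputs = SingleYearDataset.from_simulation(sim, time_period=start_year)

# Exhaustive: calculate every variable in the simulation while loading (heavy).
ds_full = SingleYearDataset.from_simulation(
    sim, time_period=start_year, include_all_variables=True
)

# Targeted: also load only the listed calculated variables, keyed by entity.
ds_targeted = SingleYearDataset.from_simulation(
    sim,
    time_period=start_year,
    additional_variables={"person": ["employment_income"]},
)
```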
10 changes: 5 additions & 5 deletions pyproject.toml
@@ -6,14 +6,14 @@ readme = "README.md"
authors = [
{name = "PolicyEngine", email = "[email protected]"},
]
requires-python = ">=3.11"
requires-python = ">=3.13"
dependencies = [
"h5py",
"numpy",
"pandas",
"huggingface_hub>=0.25.1",
"tables",
"policyengine-core>=3.6.4",
"tables>=3.10.2",
"policyengine-core>=3.20.0",
"policyengine-us", # remove as soon as we fix UCGID
"microdf-python",
"microcalibrate",
@@ -32,7 +32,7 @@ dev = [
"build",
"linecheck",
"yaml-changelog>=0.1.7",
"policyengine-us>=1.366.0",
"policyengine-us==1.370.1",
]

docs = [
@@ -69,7 +69,7 @@ line_length = 79

[tool.black]
line-length = 79
target-version = ["py311"]
target-version = ["py313"]

[project.scripts]
policyengine-data = "policyengine_data:main"
55 changes: 42 additions & 13 deletions src/policyengine_data/calibration/calibrate.py
@@ -10,6 +10,7 @@

from policyengine_data import SingleYearDataset, normalise_table_keys
from policyengine_data.calibration.dataset_duplication import (
+identify_calculated_variables,
load_dataset_for_geography_legacy,
minimize_calibrated_dataset_legacy,
)
@@ -45,6 +46,7 @@ def calibrate_single_geography_level(
year: Optional[int] = 2023,
db_uri: Optional[str] = None,
noise_level: Optional[float] = 10.0,
+epochs: Optional[int] = 600,
use_dataset_weights: Optional[bool] = True,
regularize_with_l0: Optional[bool] = False,
calibration_log_path: Optional[str] = None,
@@ -71,6 +73,7 @@
geo_sim_filter_variable (str): The variable used to filter the simulation by geography. Default in the US: "ucgid".
db_uri (Optional[str]): The URI of the database to use for rescaling targets. If None, it will download the database from the default URI.
noise_level (Optional[float]): The level of noise to apply during calibration. Default: 10.0.
+epochs (Optional[int]): The number of training epochs to use for calibration. Default: 600.
use_dataset_weights (Optional[bool]): Whether to use original dataset weights as the starting weights for calibration. Default: True.
regularize_with_l0 (Optional[bool]): Whether to use L0 regularization during calibration. Default: False.
calibration_log_path (Optional[str]): The path to the calibration log file. If None, calibration log CSVs will not be saved.
@@ -82,6 +85,14 @@
if db_uri is None:
db_uri = download_database()

+# Identify calculated variables from the base dataset to preserve them
+important_calculated_vars = identify_calculated_variables(
+dataset, microsimulation_class
+)
+logger.info(
+f"Identified calculated variables to preserve: {important_calculated_vars}"
+)

geography_level_calibrated_dataset = None
for area, geo_identifier in calibration_areas.items():
logger.info(f"Calibrating dataset for {area}...")
@@ -148,7 +159,7 @@
targets=targets,
target_names=target_names,
estimate_matrix=metrics_matrix,
-epochs=600,
+epochs=epochs,
learning_rate=0.2,
noise_level=noise_level,
excluded_targets=(
@@ -172,6 +183,8 @@
if regularize_with_l0
else optimized_weights
),
+include_all_variables=False, # Use important variables for efficiency
+important_variables=important_calculated_vars,
)

# Detect ids that require resetting after minimization
@@ -249,6 +262,7 @@ def calibrate_all_levels(
year: Optional[int] = 2023,
db_uri: Optional[str] = None,
noise_level: Optional[float] = 10.0,
+epochs: Optional[int] = 600,
regularize_with_l0: Optional[bool] = False,
raise_error: Optional[bool] = True,
) -> "SingleYearDataset":
@@ -272,6 +286,7 @@
year (Optional[int]): The year to use for calibration. Default: 2023.
db_uri (Optional[str]): The database URI to use for calibration. If None, it will download the database from the default URI.
noise_level (Optional[float]): The noise level to use for calibration. Default: 10.0.
+epochs (Optional[int]): The number of training epochs to use for calibration. Default: 600.
regularize_with_l0 (Optional[bool]): Whether to use L0 regularization for calibration. Default: False.
raise_error (Optional[bool]): Whether to raise an error if matrix creation fails. Default: True.

@@ -281,6 +296,14 @@
if db_uri is None:
db_uri = download_database()

+# Identify calculated variables from the base dataset to preserve them
+important_calculated_vars = identify_calculated_variables(
+dataset, microsimulation_class
+)
+logger.info(
+f"Identified calculated variables to preserve: {important_calculated_vars}"
+)

stacked_dataset = None
for area, geo_identifier in database_stacking_areas.items():
logger.info(f"Stacking dataset for {area}...")
@@ -304,6 +327,8 @@
single_year_dataset = SingleYearDataset.from_simulation(
simulation=sim_data_to_stack,
time_period=year,
+include_all_variables=False, # Use important variables for efficiency
+additional_variables=important_calculated_vars,
)

# Detect ids that require resetting
@@ -408,7 +433,7 @@
targets=targets,
target_names=target_names,
estimate_matrix=metrics_matrix,
-epochs=600,
+epochs=epochs,
learning_rate=0.2,
noise_level=noise_level,
excluded_targets=(
@@ -437,6 +462,8 @@
if regularize_with_l0
else optimized_weights
),
+include_all_variables=False, # Use important variables for efficiency
+important_variables=important_calculated_vars,
)

return fully_calibrated_dataset
@@ -513,14 +540,14 @@ def calibrate_all_levels(
db_uri=db_uri, update_database=True
)

-# Uprate targets for consistency across definition year (disabled until IRS SOI variables are renamed to avoid errors)
-# uprating_results = uprate_calibration_targets(
-# system=system,
-# db_uri=db_uri,
-# from_period=2022,
-# to_period=2023,
-# update_database=True,
-# )
+# Uprate targets for consistency across definition year
+uprating_results = uprate_calibration_targets(
+system=system,
+db_uri=db_uri,
+from_period=2022,
+to_period=2023,
+update_database=True,
+)

state_level_calibrated_dataset = calibrate_single_geography_level(
Microsimulation,
Expand All @@ -529,6 +556,7 @@ def calibrate_all_levels(
db_uri=db_uri,
use_dataset_weights=False,
regularize_with_l0=True,
+raise_error=False,
)

state_level_weights = state_level_calibrated_dataset.entities["household"][
@@ -537,7 +565,7 @@

SingleYearDataset_to_Dataset(
state_level_calibrated_dataset,
output_path="Dataset_state_level_age_medicaid_snap_eitc_agi_targets.h5",
output_path="Dataset_state_level_Aug20.h5",
)

print("Completed calibration for state level dataset.")
@@ -550,12 +578,13 @@
national_level_calibrated_dataset = calibrate_single_geography_level(
Microsimulation,
areas_in_national_level,
dataset="Dataset_state_level_age_medicaid_snap_eitc_agi_targets.h5",
dataset="Dataset_state_level_Aug20.h5",
db_uri=db_uri,
stack_datasets=False,
noise_level=0.0,
use_dataset_weights=True,
regularize_with_l0=False,
+raise_error=False,
)

national_level_weights = national_level_calibrated_dataset.entities[
Expand All @@ -564,7 +593,7 @@ def calibrate_all_levels(

SingleYearDataset_to_Dataset(
national_level_calibrated_dataset,
output_path="Dataset_national_level_age_medicaid_snap_eitc_agi_targets.h5",
output_path="Dataset_national_level_Aug20.h5",
)

print("Completed calibration for national level dataset.")