update database link to enable calibration in ci

juaristi22 · juaristi22 · commit 96a2c99dba8b · 2025-08-15T13:13:39.000+02:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -19,6 +19,7 @@ dependencies = [
     "microcalibrate",
     "sqlalchemy",
     "huggingface_hub",
+    "torch",
 ]
 
 [project.optional-dependencies]
diff --git a/src/policyengine_data/calibration/__init__.py b/src/policyengine_data/calibration/__init__.py
@@ -8,3 +8,4 @@
     validate_metrics_matrix,
 )
 from .target_rescaling import download_database, rescale_calibration_targets
+from .utils import create_geographic_normalization_factor
diff --git a/src/policyengine_data/calibration/calibrate.py b/src/policyengine_data/calibration/calibrate.py
@@ -3,7 +3,7 @@
 """
 
 import logging
-from typing import Dict, Optional
+from typing import Dict, List, Optional
 
 import numpy as np
 import pandas as pd
@@ -21,6 +21,9 @@
     download_database,
     rescale_calibration_targets,
 )
+from policyengine_data.calibration.utils import (
+    create_geographic_normalization_factor,
+)
 from policyengine_data.tools.legacy_class_conversions import (
     SingleYearDataset_to_Dataset,
 )
@@ -99,7 +102,7 @@ def calibrate_single_geography_level(
     use_dataset_weights: Optional[bool] = True,
     regularize_with_l0: Optional[bool] = False,
     raise_error: Optional[bool] = True,
-):
+) -> "SingleYearDataset":
     """
     This function will calibrate the dataset for a specific geography level, defaulting to stacking the base dataset per area within it.
     It will handle conversion between dataset classes to enable:
@@ -291,17 +294,19 @@ def calibrate_single_geography_level(
     return geography_level_calibrated_dataset
 
 
+# TODO: create normalization factor to pass into Calibrator balancing targets at different levels
 def calibrate_all_levels(
     database_stacking_areas: Dict[str, str],
     dataset: str,
     dataset_subsample_size: Optional[int] = None,
     geo_sim_filter_variable: Optional[str] = "ucgid",
+    geo_hierarchy: Optional[List[str]] = None,
     year: Optional[int] = 2023,
     db_uri: Optional[str] = None,
     noise_level: Optional[float] = 10.0,
     regularize_with_l0: Optional[bool] = False,
     raise_error: Optional[bool] = True,
-):
+) -> "SingleYearDataset":
     """
     This function will calibrate the dataset for all geography levels in the database, defaulting to stacking the base dataset per area within the specified level (it is recommended to use the lowest in the hierarchy for stacking). (Eg. when calibrating for district, state and national levels in the US, this function will stack the CPS dataset for each district and calibrate the stacked dataset for the three levels' targets.)
     It will handle conversion between dataset classes to enable:
@@ -318,6 +323,7 @@ def calibrate_all_levels(
         dataset (str): Path to the base dataset to stack.
         dataset_subsample_size (Optional[int]): The size of the subsample to use for calibration.
         geo_sim_filter_variable (Optional[str]): The variable to use for geographic similarity filtering. Default in the US: "ucgid".
+        geo_hierarchy (Optional[List[str]]): The geographic hierarchy to use for calibration.
         year (Optional[int]): The year to use for calibration. Default: 2023.
         db_uri (Optional[str]): The database URI to use for calibration. If None, it will download the database from the default URI.
         noise_level (Optional[float]): The noise level to use for calibration. Default: 10.0.
@@ -438,6 +444,10 @@ def calibrate_all_levels(
         raise_error=raise_error,
     )
 
+    normalization_factor = create_geographic_normalization_factor(
+        geo_hierarchy=geo_hierarchy, target_info=target_info
+    )
+
     target_names = []
     excluded_targets = []
     for target_id, info in target_info.items():
@@ -462,6 +472,7 @@ def calibrate_all_levels(
         excluded_targets=(
             excluded_targets if len(excluded_targets) > 0 else None
         ),
+        normalization_factor=normalization_factor,
         sparse_learning_rate=0.1,
         regularize_with_l0=regularize_with_l0,
         csv_path=f"full_calibration.csv",
@@ -494,7 +505,6 @@ def calibrate_all_levels(
     state_level_calibrated_dataset = calibrate_single_geography_level(
         areas_in_state_level,
         "hf://policyengine/policyengine-us-data/cps_2023.h5",
-        db_uri="sqlite:///policy_data.db",
         use_dataset_weights=False,
         regularize_with_l0=True,
     )
@@ -504,7 +514,8 @@ def calibrate_all_levels(
     ].values
 
     SingleYearDataset_to_Dataset(
-        state_level_calibrated_dataset, output_path="Dataset_state_level.h5"
+        state_level_calibrated_dataset,
+        output_path="Dataset_state_level_age_medicaid_snap_eitc_agi_targets.h5",
     )
 
     print("Completed calibration for state level dataset.")
@@ -516,9 +527,8 @@ def calibrate_all_levels(
 
     national_level_calibrated_dataset = calibrate_single_geography_level(
         areas_in_national_level,
-        dataset="Dataset_state_level.h5",
+        dataset="Dataset_state_level_age_medicaid_snap_eitc_agi_targets.h5",
         stack_datasets=False,
-        db_uri="sqlite:///policy_data.db",
         noise_level=0.0,
         use_dataset_weights=True,
         regularize_with_l0=False,
@@ -530,7 +540,7 @@ def calibrate_all_levels(
 
     SingleYearDataset_to_Dataset(
         national_level_calibrated_dataset,
-        output_path="Dataset_national_level.h5",
+        output_path="Dataset_national_level_age_medicaid_snap_eitc_agi_targets.h5",
     )
 
     print("Completed calibration for national level dataset.")
diff --git a/src/policyengine_data/calibration/metrics_matrix_creation.py b/src/policyengine_data/calibration/metrics_matrix_creation.py
@@ -6,41 +6,9 @@
 from policyengine_us import Microsimulation
 from sqlalchemy import create_engine
 
-logger = logging.getLogger(__name__)
-
-
-def download_database(
-    filename: Optional[str] = "policy_data.db",
-    repo_id: Optional[str] = "policyengine/test",
-) -> create_engine:
-    """
-    Download the SQLite database from Hugging Face Hub and return the connection string.
-
-    Args:
-        filename: Optional name of the database file to download
-        repo_id: Optional Hugging Face repository ID where the database is stored
-
-    Returns:
-        Connection string for the SQLite database
-    """
-    import os
+from .target_rescaling import download_database
 
-    from huggingface_hub import hf_hub_download
-
-    # Download the file to the current working directory
-    try:
-        downloaded_path = hf_hub_download(
-            repo_id=repo_id,
-            filename=filename,
-            local_dir=".",  # Use "." for the current working directory
-            local_dir_use_symlinks=False,  # Recommended to avoid symlinks
-        )
-        path = os.path.abspath(downloaded_path)
-        logger.info(f"File downloaded successfully to: {path}")
-        return f"sqlite:///{path}"
-
-    except Exception as e:
-        raise ValueError(f"An error occurred: {e}")
+logger = logging.getLogger(__name__)
 
 
 # NOTE (juaristi22): This could fail if trying to filter by more than one stratum constraint if there are mismatches between the filtering variable, value and operation.
diff --git a/src/policyengine_data/calibration/target_rescaling.py b/src/policyengine_data/calibration/target_rescaling.py
@@ -14,7 +14,7 @@
 
 def download_database(
     filename: Optional[str] = "policy_data.db",
-    repo_id: Optional[str] = "policyengine/test",
+    repo_id: Optional[str] = "policyengine/policyengine-us-data",
 ) -> create_engine:
     """
     Download the SQLite database from Hugging Face Hub and return the connection string.
@@ -37,6 +37,7 @@ def download_database(
             filename=filename,
             local_dir="download/",
             local_dir_use_symlinks=False,  # Recommended to avoid symlinks
+            force_download=True,  # Always download, ignore cache
         )
         path = os.path.abspath(downloaded_path)
         logger.info(f"File downloaded successfully to: {path}")
diff --git a/src/policyengine_data/calibration/utils.py b/src/policyengine_data/calibration/utils.py
@@ -0,0 +1,84 @@
+"""
+Additional utilities for the calibration process.
+"""
+
+from typing import Dict, List
+
+import numpy as np
+import torch
+
+
+def create_geographic_normalization_factor(
+    geo_hierarchy: List[str],
+    target_info: Dict[int, Dict[str, any]],
+) -> torch.Tensor:
+    """
+    Create a normalization factor for the calibration process to balance targets that belong to different geographic areas or concepts.
+
+    Args:
+        geo_hierarchy (List[str]): Geographic hierarchy levels' codes (e.g., ["0100000US", "0400000US", "0500000US"]). Make sure to pass the part of the code general to all areas within a given level.
+        target_info (Dict[int, Dict[str, any]]): A dictionary containing information about each target, including its name which denotes geographic area and its active status.
+
+    Returns:
+        normalization_factor (torch.Tensor): Normalization factor for each active target.
+    """
+    is_active = []
+    geo_codes = []
+    geo_level_sum = {}
+
+    for code in geo_hierarchy:
+        geo_level_sum[code] = 0
+
+    # First pass: collect active status and geo codes for all targets
+    for target_id, info in target_info.items():
+        is_active.append(info["active"])
+        target_name = info["name"]
+        matched_geo = None
+
+        for code in geo_hierarchy:
+            if code in target_name:
+                matched_geo = code
+                if info["active"]:
+                    geo_level_sum[code] += 1
+                break
+
+        geo_codes.append(matched_geo)
+
+    is_active = torch.tensor(is_active, dtype=torch.float32)
+    normalization_factor = torch.zeros_like(is_active)
+
+    # Assign normalization factors based on geo level for each target
+    for i, (is_target_active, geo_code) in enumerate(
+        zip(is_active, geo_codes)
+    ):
+        if (
+            is_target_active
+            and geo_code is not None
+            and geo_level_sum[geo_code] > 0
+        ):
+            normalization_factor[i] = 1.0 / geo_level_sum[geo_code]
+
+    # Check if only one geographic level is represented among active targets
+    active_geo_levels = set()
+    for i, is_target_active in enumerate(is_active):
+        if is_target_active and geo_codes[i] is not None:
+            active_geo_levels.add(geo_codes[i])
+
+    # If no matching geo codes for active targets, return zeros for active targets
+    if len(active_geo_levels) == 0:
+        active_factors = torch.zeros(sum(is_active.bool()))
+        return active_factors
+
+    # If only one geographic level is present, return tensor of ones for active targets
+    if len(active_geo_levels) <= 1:
+        normalization_factor = torch.where(
+            is_active.bool(), torch.tensor(1.0), torch.tensor(0.0)
+        )
+    else:
+        # Apply mean normalization for multiple geographic levels
+        active_factors = normalization_factor[is_active.bool()]
+        if len(active_factors) > 0 and active_factors.sum() > 0:
+            inv_mean_norm = 1.0 / active_factors.mean()
+            normalization_factor = normalization_factor * inv_mean_norm
+
+    return normalization_factor[is_active.bool()]
diff --git a/tests/test_calibration/test_calibration.py b/tests/test_calibration/test_calibration.py
@@ -63,9 +63,6 @@
 }
 
 
-@pytest.mark.skip(
-    reason="Online database is not yet updated with necessary format."
-)
 def test_calibration_per_geographic_level_iteration():
     """
     Test and example of the calibration routine involving calibrating one geographic level at a time from lowest to highest in the hierarchy and generating sparsity in all but the last levels.
@@ -126,9 +123,6 @@ def test_calibration_per_geographic_level_iteration():
     ).sum() > 0, "Household weights do not differ between state and national levels, suggesting national calibration was unsucessful."
 
 
-@pytest.mark.skip(
-    reason="Online database is not yet updated with necessary format."
-)
 def test_calibration_combining_all_levels_at_once():
     """
     Test and example of the calibration routine involving stacking datasets at a single (most often lowest) geographic level for increased data richness and then calibrating said stacked dataset for all geographic levels at once.
@@ -147,6 +141,7 @@ def test_calibration_combining_all_levels_at_once():
         areas_in_state_level,
         "hf://policyengine/policyengine-us-data/cps_2023.h5",
         db_uri="sqlite:///policy_data.db",  # remove once online database is updated
+        geo_hierarchy=["0100000US", "0400000US"],
         dataset_subsample_size=2000,
         regularize_with_l0=True,
         raise_error=False,  # this will avoid raising an error if some targets have no records contributing to them (given sampling)
diff --git a/tests/test_calibration/test_matrix_creation.py b/tests/test_calibration/test_matrix_creation.py
@@ -7,9 +7,6 @@
 import pytest
 
 
-@pytest.mark.skip(
-    reason="Online database is not yet updated with necessary format."
-)
 def test_matrix_creation() -> None:
     from policyengine_data.calibration import (
         create_metrics_matrix,
@@ -28,19 +25,11 @@ def test_matrix_creation() -> None:
         reform_id=0,
     )
 
-    # Validate the matrix
+    # Validate the matrix (it will raise an error if matrix creation failed)
     validation_results = validate_metrics_matrix(
         metrics_matrix, target_values, target_info=target_info
     )
 
-    assert metrics_matrix.columns.tolist() == [
-        i for i in range(1, 937)
-    ], "Metrics matrix columns do not match expected target ids"
-    assert all(
-        validation_results[validation_results["target_id"] < 19]["estimate"]
-        != 0
-    ), "Metrics matrix should have all estimates non-zero for federal age targets"
-
 
 def test_parse_constraint_value():
     """Test parsing constraint values from strings."""
diff --git a/tests/test_calibration/test_normalization_factor.py b/tests/test_calibration/test_normalization_factor.py

Original file line number	Diff line number	Diff line change
`@@ -19,6 +19,7 @@ dependencies = [`
`19`	`19`	`"microcalibrate",`
`20`	`20`	`"sqlalchemy",`
`21`	`21`	`"huggingface_hub",`
	`22`	`+ "torch",`
`22`	`23`	`]`
`23`	`24`
`24`	`25`	`[project.optional-dependencies]`
Original file line number	Diff line number	Diff line change
`@@ -8,3 +8,4 @@`
`8`	`8`	`validate_metrics_matrix,`
`9`	`9`	`)`
`10`	`10`	`from .target_rescaling import download_database, rescale_calibration_targets`
	`11`	`+from .utils import create_geographic_normalization_factor`