2 changes: 1 addition & 1 deletion .github/workflows/main.yml
@@ -9,7 +9,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.11"]
python-version: ["3.13"]

steps:
- name: Checkout repo
6 changes: 3 additions & 3 deletions .github/workflows/pr.yaml
@@ -11,7 +11,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.11"
python-version: "3.13"
- name: Install uv
uses: astral-sh/setup-uv@v5
- name: Install relevant dependencies
@@ -24,7 +24,7 @@ jobs:
strategy:
matrix:
os: [ ubuntu-latest ]
python-version: ["3.11"]
python-version: ["3.13"]
fail-fast: false
runs-on: ${{ matrix.os }}
steps:
@@ -58,7 +58,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.11"
python-version: "3.13"
- name: Install dependencies
run: |
uv pip install -e ".[dev,docs]" --system
2 changes: 1 addition & 1 deletion .github/workflows/versioning.yaml
@@ -25,7 +25,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v5
with:
-python-version: 3.11
+python-version: 3.13
- name: Build changelog
run: pip install yaml-changelog && make changelog
- name: Preview changelog update
4 changes: 4 additions & 0 deletions changelog_entry.yaml
@@ -0,0 +1,4 @@
+- bump: patch
+  changes:
+    fixed:
+      - Logic to add calculated variables (not input variables) to SingleYearDataset when loading from a microsim.
22 changes: 12 additions & 10 deletions docs/calibration.ipynb
@@ -59,7 +59,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"id": "5a58bd2b",
"metadata": {},
"outputs": [
@@ -92,12 +92,12 @@
" db_uri=db_uri, update_database=True\n",
")\n",
"\n",
"# Uprate targets for consistency across definition year (disabled until IRS SOI variables are renamed to avoid errors)\n",
"# uprating_results = uprate_calibration_targets(\n",
"# system=system, db_uri=db_uri, \n",
"# from_period=2022, to_period=2023, \n",
"# update_database=True\n",
"# )"
"# Uprate targets for consistency across definition year\n",
"uprating_results = uprate_calibration_targets(\n",
" system=system, db_uri=db_uri, \n",
" from_period=2022, to_period=2023, \n",
" update_database=True\n",
")"
]
},
{
@@ -118,7 +118,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"id": "c75954d1",
"metadata": {},
"outputs": [
@@ -197,7 +197,8 @@
" dataset_subsample_size=10000, # Small sample for faster execution\n",
" use_dataset_weights=False, # Start with equal weights\n",
" regularize_with_l0=True, # Enable sparsity\n",
" noise_level=10.0\n",
" noise_level=10.0,\n",
" raise_error=False,\n",
")\n",
"\n",
"# Examine the results\n",
@@ -225,7 +226,8 @@
" stack_datasets=False, # Don't stack since we're using pre-stacked data\n",
" noise_level=0.0, # Minimal noise to preserve state calibration\n",
" use_dataset_weights=True, # Start from state-calibrated weights\n",
" regularize_with_l0=False # No sparsity at national level\n",
" regularize_with_l0=False, # No sparsity at national level\n",
" raise_error=False,\n",
")\n",
"\n",
"# Compare results\n",
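Reviewer note on the re-enabled uprating step above: conceptually, a target defined for 2022 is re-expressed in 2023 terms by applying a growth factor. A minimal sketch of that idea, assuming an illustrative flat factor (the real `uprate_calibration_targets` derives its factors from the calibration database rather than taking them as arguments):

```python
# Conceptual sketch only: re-express a 2022 calibration target in 2023 terms.
def uprate_target(value_2022: float, growth_factor: float = 1.04) -> float:
    # growth_factor is a hypothetical year-over-year index ratio; the real
    # pipeline looks up per-target factors instead of using a constant.
    return value_2022 * growth_factor


print(uprate_target(1_000_000.0))  # 1040000.0
```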
17 changes: 15 additions & 2 deletions docs/dataset.ipynb
@@ -169,9 +169,17 @@
"#### Method 3: From a PolicyEngine MicroSimulation"
]
},
+{
+"cell_type": "markdown",
+"id": "f7545881",
+"metadata": {},
+"source": [
+"Note that, by default, the `from_simulation()` method only loads input variables, which come from the underlying dataset. This can cause problems later when trying to calculate specific variables from the loaded SingleYearDataset without running the whole microsimulation. Setting the `include_all_variables` parameter to `True` calculates all variables in the microsimulation object while loading the SingleYearDataset, which avoids such problems but is computationally heavy. Alternatively, the `additional_variables` parameter accepts a dictionary of specific \"calculated\" variables to load, avoiding the computation of variables that will not be needed."
+]
+},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"id": "4f1a90f6",
"metadata": {},
"outputs": [
@@ -192,7 +200,12 @@
"\n",
"sim = Microsimulation(dataset=dataset)\n",
"\n",
"single_year_dataset = SingleYearDataset.from_simulation(sim, time_period=start_year)\n",
"single_year_dataset = SingleYearDataset.from_simulation(\n",
" sim, \n",
" time_period=start_year,\n",
" include_all_variables=False,\n",
" additional_variables={\"person\": [\"employment_income\"]},\n",
")\n",
"single_year_dataset.time_period = start_year\n",
"\n",
"print(f\"Dataset created from PolicyEngine US microdata stored in {dataset}\")\n",
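The three loading modes described in the new markdown cell, side by side. This is a sketch against the signature shown in this diff, reusing the `sim` and `start_year` defined in the notebook cell above:

```python
# Default: input variables only (fast; calculated variables are absent).
ds_inputs = SingleYearDataset.from_simulation(sim, time_period=start_year)

# Exhaustive: calculate every variable in the simulation while loading (heavy).
ds_full = SingleYearDataset.from_simulation(
    sim, time_period=start_year, include_all_variables=True
)

# Targeted: also load only the listed calculated variables, keyed by entity.
ds_targeted = SingleYearDataset.from_simulation(
    sim,
    time_period=start_year,
    additional_variables={"person": ["employment_income"]},
)
```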
10 changes: 5 additions & 5 deletions pyproject.toml
@@ -6,14 +6,14 @@ readme = "README.md"
authors = [
{name = "PolicyEngine", email = "[email protected]"},
]
requires-python = ">=3.11"
requires-python = ">=3.13"
dependencies = [
"h5py",
"numpy",
"pandas",
"huggingface_hub>=0.25.1",
"tables",
"policyengine-core>=3.6.4",
"tables>=3.10.2",
"policyengine-core>=3.20.0",
"policyengine-us", # remove as soon as we fix UCGID
"microdf-python",
"microcalibrate",
@@ -32,7 +32,7 @@ dev = [
"build",
"linecheck",
"yaml-changelog>=0.1.7",
"policyengine-us>=1.366.0",
"policyengine-us==1.370.1",
]

docs = [
@@ -69,7 +69,7 @@ line_length = 79

[tool.black]
line-length = 79
target-version = ["py311"]
target-version = ["py313"]

[project.scripts]
policyengine-data = "policyengine_data:main"
55 changes: 42 additions & 13 deletions src/policyengine_data/calibration/calibrate.py
@@ -10,6 +10,7 @@

from policyengine_data import SingleYearDataset, normalise_table_keys
from policyengine_data.calibration.dataset_duplication import (
+identify_calculated_variables,
load_dataset_for_geography_legacy,
minimize_calibrated_dataset_legacy,
)
@@ -45,6 +46,7 @@ def calibrate_single_geography_level(
year: Optional[int] = 2023,
db_uri: Optional[str] = None,
noise_level: Optional[float] = 10.0,
+epochs: Optional[int] = 600,
use_dataset_weights: Optional[bool] = True,
regularize_with_l0: Optional[bool] = False,
calibration_log_path: Optional[str] = None,
@@ -71,6 +73,7 @@
geo_sim_filter_variable (str): The variable used to filter the simulation by geography. Default in the US: "ucgid".
db_uri (Optional[str]): The URI of the database to use for rescaling targets. If None, it will download the database from the default URI.
noise_level (Optional[float]): The level of noise to apply during calibration. Default: 10.0.
+epochs (Optional[int]): The number of training epochs to use for calibration. Default: 600.
use_dataset_weights (Optional[bool]): Whether to use original dataset weights as the starting weights for calibration. Default: True.
regularize_with_l0 (Optional[bool]): Whether to use L0 regularization during calibration. Default: False.
calibration_log_path (Optional[str]): The path to the calibration log file. If None, calibration log CSVs will not be saved.
@@ -82,6 +85,14 @@
if db_uri is None:
db_uri = download_database()

+# Identify calculated variables from the base dataset to preserve them
+important_calculated_vars = identify_calculated_variables(
+dataset, microsimulation_class
+)
+logger.info(
+f"Identified calculated variables to preserve: {important_calculated_vars}"
+)

geography_level_calibrated_dataset = None
for area, geo_identifier in calibration_areas.items():
logger.info(f"Calibrating dataset for {area}...")
@@ -148,7 +159,7 @@
targets=targets,
target_names=target_names,
estimate_matrix=metrics_matrix,
-epochs=600,
+epochs=epochs,
learning_rate=0.2,
noise_level=noise_level,
excluded_targets=(
@@ -172,6 +183,8 @@
if regularize_with_l0
else optimized_weights
),
+include_all_variables=False, # Use important variables for efficiency
+important_variables=important_calculated_vars,
)

# Detect ids that require resetting after minimization
@@ -249,6 +262,7 @@ def calibrate_all_levels(
year: Optional[int] = 2023,
db_uri: Optional[str] = None,
noise_level: Optional[float] = 10.0,
+epochs: Optional[int] = 600,
regularize_with_l0: Optional[bool] = False,
raise_error: Optional[bool] = True,
) -> "SingleYearDataset":
@@ -272,6 +286,7 @@
year (Optional[int]): The year to use for calibration. Default: 2023.
db_uri (Optional[str]): The database URI to use for calibration. If None, it will download the database from the default URI.
noise_level (Optional[float]): The noise level to use for calibration. Default: 10.0.
+epochs (Optional[int]): The number of training epochs to use for calibration. Default: 600.
regularize_with_l0 (Optional[bool]): Whether to use L0 regularization for calibration. Default: False.
raise_error (Optional[bool]): Whether to raise an error if matrix creation fails. Default: True.

@@ -281,6 +296,14 @@
if db_uri is None:
db_uri = download_database()

+# Identify calculated variables from the base dataset to preserve them
+important_calculated_vars = identify_calculated_variables(
+dataset, microsimulation_class
+)
+logger.info(
+f"Identified calculated variables to preserve: {important_calculated_vars}"
+)

stacked_dataset = None
for area, geo_identifier in database_stacking_areas.items():
logger.info(f"Stacking dataset for {area}...")
@@ -304,6 +327,8 @@
single_year_dataset = SingleYearDataset.from_simulation(
simulation=sim_data_to_stack,
time_period=year,
+include_all_variables=False, # Use important variables for efficiency
+additional_variables=important_calculated_vars,
)

# Detect ids that require resetting
@@ -408,7 +433,7 @@
targets=targets,
target_names=target_names,
estimate_matrix=metrics_matrix,
-epochs=600,
+epochs=epochs,
learning_rate=0.2,
noise_level=noise_level,
excluded_targets=(
@@ -437,6 +462,8 @@
if regularize_with_l0
else optimized_weights
),
+include_all_variables=False, # Use important variables for efficiency
+important_variables=important_calculated_vars,
)

return fully_calibrated_dataset
@@ -513,14 +540,14 @@ def calibrate_all_levels(
db_uri=db_uri, update_database=True
)

-# Uprate targets for consistency across definition year (disabled until IRS SOI variables are renamed to avoid errors)
-# uprating_results = uprate_calibration_targets(
-# system=system,
-# db_uri=db_uri,
-# from_period=2022,
-# to_period=2023,
-# update_database=True,
-# )
+# Uprate targets for consistency across definition year
+uprating_results = uprate_calibration_targets(
+system=system,
+db_uri=db_uri,
+from_period=2022,
+to_period=2023,
+update_database=True,
+)

state_level_calibrated_dataset = calibrate_single_geography_level(
Microsimulation,
Expand All @@ -529,6 +556,7 @@ def calibrate_all_levels(
db_uri=db_uri,
use_dataset_weights=False,
regularize_with_l0=True,
+raise_error=False,
)

state_level_weights = state_level_calibrated_dataset.entities["household"][
@@ -537,7 +565,7 @@

SingleYearDataset_to_Dataset(
state_level_calibrated_dataset,
output_path="Dataset_state_level_age_medicaid_snap_eitc_agi_targets.h5",
output_path="Dataset_state_level_Aug20.h5",
)

print("Completed calibration for state level dataset.")
@@ -550,12 +578,13 @@
national_level_calibrated_dataset = calibrate_single_geography_level(
Microsimulation,
areas_in_national_level,
dataset="Dataset_state_level_age_medicaid_snap_eitc_agi_targets.h5",
dataset="Dataset_state_level_Aug20.h5",
db_uri=db_uri,
stack_datasets=False,
noise_level=0.0,
use_dataset_weights=True,
regularize_with_l0=False,
+raise_error=False,
)

national_level_weights = national_level_calibrated_dataset.entities[
Expand All @@ -564,7 +593,7 @@ def calibrate_all_levels(

SingleYearDataset_to_Dataset(
national_level_calibrated_dataset,
output_path="Dataset_national_level_age_medicaid_snap_eitc_agi_targets.h5",
output_path="Dataset_national_level_Aug20.h5",
)

print("Completed calibration for national level dataset.")