From fbb3cac8affd5fa0405488992d87f327e2eb98bf Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Wed, 20 Aug 2025 18:47:02 +0200 Subject: [PATCH 1/6] fix calculated variables being left out of SingleYearDataset objects --- changelog_entry.yaml | 4 + .../calibration/calibrate.py | 49 +++- .../calibration/dataset_duplication.py | 68 ++++- src/policyengine_data/single_year_dataset.py | 85 +++++- .../tools/legacy_class_conversions.py | 104 ++++--- .../test_calculated_variables_preservation.py | 276 ++++++++++++++++++ 6 files changed, 525 insertions(+), 61 deletions(-) create mode 100644 tests/test_calibration/test_calculated_variables_preservation.py diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29..fd0139c 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: patch + changes: + fixed: + - Logic to add calculated variables (not input variables) to SingleYearDataset when loading from a microsim. diff --git a/src/policyengine_data/calibration/calibrate.py b/src/policyengine_data/calibration/calibrate.py index 072ee61..6ece759 100644 --- a/src/policyengine_data/calibration/calibrate.py +++ b/src/policyengine_data/calibration/calibrate.py @@ -10,6 +10,7 @@ from policyengine_data import SingleYearDataset, normalise_table_keys from policyengine_data.calibration.dataset_duplication import ( + identify_calculated_variables, load_dataset_for_geography_legacy, minimize_calibrated_dataset_legacy, ) @@ -82,6 +83,14 @@ def calibrate_single_geography_level( if db_uri is None: db_uri = download_database() + # Identify calculated variables from the base dataset to preserve them + important_calculated_vars = identify_calculated_variables( + dataset, microsimulation_class + ) + logger.info( + f"Identified calculated variables to preserve: {important_calculated_vars}" + ) + geography_level_calibrated_dataset = None for area, geo_identifier in calibration_areas.items(): logger.info(f"Calibrating dataset for {area}...") @@ -156,7 +165,7 @@ def calibrate_single_geography_level( ), sparse_learning_rate=0.1, regularize_with_l0=regularize_with_l0, - csv_path=calibration_log_path, + csv_path=f"{area}_calibration.csv", ) performance_log = calibrator.calibrate() optimized_sparse_weights = calibrator.sparse_weights @@ -172,6 +181,8 @@ def calibrate_single_geography_level( if regularize_with_l0 else optimized_weights ), + include_all_variables=False, # Use important variables for efficiency + important_variables=important_calculated_vars, ) # Detect ids that require resetting after minimization @@ -281,6 +292,14 @@ def calibrate_all_levels( if db_uri is None: db_uri = download_database() + # Identify calculated variables from the base dataset to preserve them + important_calculated_vars = identify_calculated_variables( + dataset, microsimulation_class + ) + logger.info( + f"Identified calculated variables to preserve: {important_calculated_vars}" + ) + stacked_dataset = None for area, geo_identifier in database_stacking_areas.items(): logger.info(f"Stacking dataset for {area}...") @@ -304,6 +323,8 @@ def calibrate_all_levels( single_year_dataset = SingleYearDataset.from_simulation( simulation=sim_data_to_stack, time_period=year, + include_all_variables=False, # Use important variables for efficiency + additional_variables=important_calculated_vars, ) # Detect ids that require resetting @@ -437,6 +458,8 @@ def calibrate_all_levels( if regularize_with_l0 else optimized_weights ), + include_all_variables=False, # Use important variables for efficiency + 
important_variables=important_calculated_vars, ) return fully_calibrated_dataset @@ -513,14 +536,14 @@ def calibrate_all_levels( db_uri=db_uri, update_database=True ) - # Uprate targets for consistency across definition year (disabled until IRS SOI variables are renamed to avoid errors) - # uprating_results = uprate_calibration_targets( - # system=system, - # db_uri=db_uri, - # from_period=2022, - # to_period=2023, - # update_database=True, - # ) + # Uprate targets for consistency across definition year + uprating_results = uprate_calibration_targets( + system=system, + db_uri=db_uri, + from_period=2022, + to_period=2023, + update_database=True, + ) state_level_calibrated_dataset = calibrate_single_geography_level( Microsimulation, @@ -529,6 +552,7 @@ def calibrate_all_levels( db_uri=db_uri, use_dataset_weights=False, regularize_with_l0=True, + raise_error=False, ) state_level_weights = state_level_calibrated_dataset.entities["household"][ @@ -537,7 +561,7 @@ def calibrate_all_levels( SingleYearDataset_to_Dataset( state_level_calibrated_dataset, - output_path="Dataset_state_level_age_medicaid_snap_eitc_agi_targets.h5", + output_path="Dataset_state_level_Aug20.h5", ) print("Completed calibration for state level dataset.") @@ -550,12 +574,13 @@ def calibrate_all_levels( national_level_calibrated_dataset = calibrate_single_geography_level( Microsimulation, areas_in_national_level, - dataset="Dataset_state_level_age_medicaid_snap_eitc_agi_targets.h5", + dataset="Dataset_state_level_Aug20.h5", db_uri=db_uri, stack_datasets=False, noise_level=0.0, use_dataset_weights=True, regularize_with_l0=False, + raise_error=False, ) national_level_weights = national_level_calibrated_dataset.entities[ @@ -564,7 +589,7 @@ def calibrate_all_levels( SingleYearDataset_to_Dataset( national_level_calibrated_dataset, - output_path="Dataset_national_level_age_medicaid_snap_eitc_agi_targets.h5", + output_path="Dataset_national_level_Aug20.h5", ) print("Completed calibration for national level dataset.") diff --git a/src/policyengine_data/calibration/dataset_duplication.py b/src/policyengine_data/calibration/dataset_duplication.py index 4eca09b..9a2846e 100644 --- a/src/policyengine_data/calibration/dataset_duplication.py +++ b/src/policyengine_data/calibration/dataset_duplication.py @@ -14,6 +14,57 @@ """ +def identify_calculated_variables( + dataset_path: str, + microsimulation_class, +) -> dict: + """ + Identify calculated variables in a dataset by comparing with input variables. 
+ + Args: + dataset_path: Path to the dataset file (e.g., "cps_2023.h5") + microsimulation_class: The Microsimulation class to get input variables from + + Returns: + Dict mapping entity names to lists of calculated variables in the dataset + E.g., {"person": ["employment_income", "self_employment_income"], ...} + """ + import h5py + + # Load microsimulation to get input variables + sim = microsimulation_class(dataset=dataset_path) + input_vars = set(sim.input_variables) + + # Get all variables from the dataset file + calculated_by_entity = {} + + # Handle HuggingFace URLs by using the sim's loaded dataset + if dataset_path.startswith("hf://"): + # Get the actual file path from the simulation's dataset + actual_path = sim.dataset.file_path + else: + actual_path = dataset_path + + with h5py.File(actual_path, "r") as f: + dataset_variables = set(f.keys()) + # Find calculated variables (in dataset but not input variables) + calculated_vars = dataset_variables - input_vars + + # Organize by entity + for var in calculated_vars: + if var in sim.tax_benefit_system.variables: + entity = sim.tax_benefit_system.variables[var].entity.key + if entity not in calculated_by_entity: + calculated_by_entity[entity] = [] + calculated_by_entity[entity].append(var) + + # Sort variables within each entity for consistency + for entity in calculated_by_entity: + calculated_by_entity[entity].sort() + + return calculated_by_entity + + def load_dataset_for_geography_legacy( microsimulation_class, year: Optional[int] = 2023, @@ -92,7 +143,12 @@ def load_dataset_for_geography_legacy( def minimize_calibrated_dataset_legacy( - microsimulation_class, sim, year: int, optimized_weights: pd.Series + microsimulation_class, + sim, + year: int, + optimized_weights: pd.Series, + include_all_variables: bool = False, + important_variables: list = None, ) -> "SingleYearDataset": """ Use sparse weights to minimize the calibrated dataset storing in the legacy Dataset class. @@ -102,6 +158,8 @@ def minimize_calibrated_dataset_legacy( sim: The Microsimulation object with the dataset to minimize. year (int): Year the dataset is representing. optimized_weights (pd.Series): The calibrated, regularized weights used to minimize the dataset. + include_all_variables (bool): If True, include ALL variables (both input and calculated). If False, include only input variables plus important calculated ones from important_variables if not None. + important_variables (list): List of important calculated variables to include if include_all_variables is False. 
Returns: SingleYearDataset: The regularized dataset @@ -162,7 +220,13 @@ def minimize_calibrated_dataset_legacy( sim.default_input_period = year sim.build_from_dataset() - single_year_dataset = SingleYearDataset.from_simulation(sim, year) + # Create SingleYearDataset using the from_simulation method + single_year_dataset = SingleYearDataset.from_simulation( + sim, + year, + include_all_variables=include_all_variables, + additional_variables=important_variables, + ) return single_year_dataset diff --git a/src/policyengine_data/single_year_dataset.py b/src/policyengine_data/single_year_dataset.py index 7639b35..510164b 100644 --- a/src/policyengine_data/single_year_dataset.py +++ b/src/policyengine_data/single_year_dataset.py @@ -112,30 +112,89 @@ def from_simulation( simulation: "Microsimulation", time_period: int = 2025, entity_names_to_include: Optional[List[str]] = None, + include_all_variables: bool = False, + additional_variables: Optional[Dict[str, List[str]]] = None, ) -> "SingleYearDataset": + """ + Create a SingleYearDataset from a Microsimulation. + + Args: + simulation: The Microsimulation to extract data from + time_period: The time period for the data + entity_names_to_include: Specific entities to include (None = all) + include_all_variables: If True, include ALL variables (both input and calculated) + additional_variables: Dict mapping entity names to lists of additional variables to include beyond input variables. + E.g., {"person": ["employment_income"], "tax_unit": ["eitc", "adjusted_gross_income"]} + + Returns: + SingleYearDataset with the specified variables + """ entity_dfs = {} - # If no entity names specified, use all available entities - if entity_names_to_include is None: + # Determine which entities to process + if include_all_variables: + # When including all variables, get entities from all variables entity_names = list( set( simulation.tax_benefit_system.variables[var].entity.key - for var in simulation.input_variables + for var in simulation.tax_benefit_system.variables ) ) - else: + elif entity_names_to_include is not None: entity_names = entity_names_to_include + else: + # Default: get entities from input variables + entity_names = list( + set( + simulation.tax_benefit_system.variables[var].entity.key + for var in simulation.input_variables + ) + ) + # Process each entity for entity in entity_names: - input_variables = [ - variable - for variable in simulation.input_variables - if simulation.tax_benefit_system.variables[variable].entity.key - == entity - ] - entity_dfs[entity] = simulation.calculate_dataframe( - input_variables, period=time_period - ) + variables_to_include = [] + + if include_all_variables: + # Get ALL variables for this entity (already filtered by entity) + variables_to_include = [ + var_name + for var_name in simulation.tax_benefit_system.variables + if simulation.tax_benefit_system.variables[ + var_name + ].entity.key + == entity + ] + else: + # Start with input variables for this entity + variables_to_include = [ + variable + for variable in simulation.input_variables + if simulation.tax_benefit_system.variables[ + variable + ].entity.key + == entity + ] + + # Add any additional specified variables for this entity + if additional_variables and entity in additional_variables: + for var in additional_variables[entity]: + # Verify the variable exists, belongs to this entity, and isn't already included + if ( + var in simulation.tax_benefit_system.variables + and simulation.tax_benefit_system.variables[ + var + ].entity.key + == entity + and 
var not in variables_to_include
+                    ):
+                        variables_to_include.append(var)
+
+        # Calculate all variables for this entity (all should belong to the entity now)
+        if variables_to_include:
+            entity_dfs[entity] = simulation.calculate_dataframe(
+                variables_to_include, period=time_period
+            )
 
         return SingleYearDataset(
             entities=entity_dfs,
diff --git a/src/policyengine_data/tools/legacy_class_conversions.py b/src/policyengine_data/tools/legacy_class_conversions.py
index c0fb17a..3eae623 100644
--- a/src/policyengine_data/tools/legacy_class_conversions.py
+++ b/src/policyengine_data/tools/legacy_class_conversions.py
@@ -7,6 +7,7 @@
 
 import h5py
 import numpy as np
+import pandas as pd
 
 from ..single_year_dataset import SingleYearDataset
 
@@ -20,8 +21,7 @@ def SingleYearDataset_to_Dataset(
     Convert a SingleYearDataset to legacy Dataset format and save as h5 file.
 
     This function loads entity tables from a SingleYearDataset, separates them into
-    variable arrays, and saves them in the legacy ARRAYS format used
-    by the legacy Dataset class.
+    variable arrays, and saves them in the flat ARRAYS format expected by PolicyEngine.
 
     Args:
         dataset: SingleYearDataset instance with entity tables
@@ -34,40 +34,76 @@
     output_path = Path(output_path)
     output_path.parent.mkdir(parents=True, exist_ok=True)
 
-    # Convert entity tables to variable arrays dictionary with proper type handling
-    variable_arrays = {}
+    # Save in flat ARRAYS format (all variables as datasets at root level)
+    with h5py.File(output_path, "w") as f:
+        for entity_name, entity_df in dataset.entities.items():
+            # Process each column as a variable
+            for column_name in entity_df.columns:
+                values = entity_df[column_name].values
 
-    for entity_name, entity_df in dataset.entities.items():
-        # Extract each column as a separate variable array
-        for column_name in entity_df.columns:
-            values = entity_df[column_name].values
+                # Handle special data type conversions
+                if values.dtype == object:
+                    try:
+                        # Try to convert to appropriate type
+                        if column_name in [
+                            "state_name",
+                            "state_code",
+                            "state_code_str",
+                        ]:
+                            # String columns - encode as fixed-length strings
+                            max_len = max(
+                                len(str(v)) for v in values if v is not None
+                            )
+                            values = np.array(
+                                [
+                                    str(v) if v is not None else ""
+                                    for v in values
+                                ],
+                                dtype=f"S{max_len}",
+                            )
+                        elif column_name == "county_fips":
+                            values = values.astype("int32")
+                        else:
+                            # Try numeric conversion first
+                            try:
+                                values = pd.to_numeric(values, errors="raise")
+                                # Keep integers as integers for certain variables
+                                if column_name.endswith(
+                                    "_id"
+                                ) or column_name in ["age", "count", "year"]:
+                                    values = values.astype("int64")
+                                else:
+                                    values = values.astype("float64")
+                            except:
+                                # Fall back to string
+                                values = np.array(
+                                    [str(v).encode() for v in values],
+                                    dtype="S",
+                                )
+                    except Exception as e:
+                        # Final fallback
+                        values = np.array(
+                            [str(v).encode() for v in values], dtype="S"
+                        )
 
-            # Handle special data type conversions following CPS pattern
-            if values.dtype == object:
-                # Try to determine if this should be string or numeric
-                try:
-                    # Check if it's actually string data that should be encoded
-                    if hasattr(values, "decode_to_str"):
-                        values = values.decode_to_str().astype("S")
-                    elif column_name == "county_fips":
-                        values = values.astype("int32")
+                # Convert bool to int
+                elif values.dtype == bool:
+                    values = values.astype("int64")
+
+                # Preserve integer types for ID variables
+                elif np.issubdtype(values.dtype, np.integer):
+                    if column_name.endswith("_id"):
+                        values =
values.astype("int64") else: - # For other object types, try to preserve as string - values = np.array(values, dtype="S") - except: - # Fallback: convert to string - values = np.array( - [str(v).encode() for v in values], dtype="S" - ) + values = values.astype("float64") - variable_arrays[column_name] = values + # Use float64 for other numeric types (matching CPS format) + elif np.issubdtype(values.dtype, np.floating): + values = values.astype("float64") - # Save in ARRAYS format (direct variable datasets) - with h5py.File(output_path, "w") as f: - for variable_name, values in variable_arrays.items(): - try: - # Store each variable directly as a dataset (no time period grouping) - f.create_dataset(variable_name, data=values) - except Exception as e: - print(f" Warning: Could not save {variable_name}: {e}") - continue + try: + # Store variable directly at root level (flat structure) + f.create_dataset(column_name, data=values) + except Exception as e: + print(f" Warning: Could not save {column_name}: {e}") + continue diff --git a/tests/test_calibration/test_calculated_variables_preservation.py b/tests/test_calibration/test_calculated_variables_preservation.py new file mode 100644 index 0000000..780aa94 --- /dev/null +++ b/tests/test_calibration/test_calculated_variables_preservation.py @@ -0,0 +1,276 @@ +""" +Test that calculated variables are properly identified and preserved during dataset minimization. +""" + +import numpy as np +import pandas as pd +from policyengine_us import Microsimulation +from policyengine_data.calibration.dataset_duplication import ( + identify_calculated_variables, + minimize_calibrated_dataset_legacy, +) + + +class TestCalculatedVariablesPreservation: + """Test suite for verifying calculated variables are preserved during dataset operations.""" + + def test_identify_calculated_variables_cps(self): + """Test that identify_calculated_variables correctly identifies calculated vars in CPS.""" + # Identify calculated variables in CPS dataset + calculated_vars = identify_calculated_variables( + "hf://policyengine/policyengine-us-data/cps_2023.h5", + Microsimulation, + ) + + # CPS should have these calculated variables + assert "person" in calculated_vars + assert "employment_income" in calculated_vars["person"] + assert "self_employment_income" in calculated_vars["person"] + assert "weekly_hours_worked" in calculated_vars["person"] + + # Should have exactly 3 person-level calculated variables + assert len(calculated_vars.get("person", [])) == 3 + + def test_minimize_preserves_calculated_variables(self): + """Test that minimize_calibrated_dataset_legacy preserves calculated variables and their values.""" + # Load CPS dataset + sim = Microsimulation( + dataset="hf://policyengine/policyengine-us-data/cps_2023.h5" + ) + sim.default_period = 2023 + + # Get original values for calculated variables + orig_employment_income = sim.calculate( + "employment_income", 2023 + ).values + orig_self_employment_income = sim.calculate( + "self_employment_income", 2023 + ).values + orig_weekly_hours = sim.calculate("weekly_hours_worked", 2023).values + + # Store original statistics + orig_emp_sum = orig_employment_income.sum() + orig_emp_nonzero_count = (orig_employment_income > 0).sum() + orig_self_emp_sum = orig_self_employment_income.sum() + orig_self_emp_nonzero_count = (orig_self_employment_income > 0).sum() + orig_hours_sum = orig_weekly_hours.sum() + orig_hours_nonzero_count = (orig_weekly_hours > 0).sum() + + # Verify we have non-zero values to start with + assert ( + 
orig_emp_sum > 0 + ), "Original employment income should have non-zero values" + assert ( + orig_emp_nonzero_count > 0 + ), "Should have people with employment income" + assert ( + orig_self_emp_sum > 0 + ), "Original self-employment income should have non-zero values" + assert ( + orig_self_emp_nonzero_count > 0 + ), "Should have people with self-employment income" + + # Create a subset with some households (use original weights for subset) + household_ids = sim.calculate("household_id", 2023).values + unique_hh_ids = np.unique(household_ids)[ + :100 + ] # Take first 100 households + orig_weights = sim.calculate("household_weight", 2023).values + + # Create subset weights + subset_weights = np.zeros_like(orig_weights) + for hh_id in unique_hh_ids: + mask = household_ids == hh_id + subset_weights[mask] = orig_weights[mask] + + # Identify calculated variables + calculated_vars = identify_calculated_variables( + "hf://policyengine/policyengine-us-data/cps_2023.h5", + Microsimulation, + ) + + # Minimize the dataset preserving calculated variables + minimized_dataset = minimize_calibrated_dataset_legacy( + Microsimulation, + sim, + 2023, + pd.Series(subset_weights), + include_all_variables=False, + important_variables=calculated_vars, + ) + + # Verify the minimized dataset has the right structure + assert "person" in minimized_dataset.entities + person_df = minimized_dataset.entities["person"] + + # Check that calculated variables are present + assert ( + "employment_income" in person_df.columns + ), "employment_income should be preserved" + assert ( + "self_employment_income" in person_df.columns + ), "self_employment_income should be preserved" + assert ( + "weekly_hours_worked" in person_df.columns + ), "weekly_hours_worked should be preserved" + + # Check that values are not all zero + min_emp_income = person_df["employment_income"].values + min_self_emp_income = person_df["self_employment_income"].values + min_weekly_hours = person_df["weekly_hours_worked"].values + + assert ( + min_emp_income.sum() > 0 + ), "Minimized employment income should not be all zeros" + assert ( + min_emp_income > 0 + ).sum() > 0, "Should have some non-zero employment income" + + assert ( + min_self_emp_income.sum() > 0 + ), "Minimized self-employment income should not be all zeros" + assert ( + min_self_emp_income > 0 + ).sum() > 0, "Should have some non-zero self-employment income" + + assert ( + min_weekly_hours.sum() > 0 + ), "Minimized weekly hours should not be all zeros" + assert ( + min_weekly_hours > 0 + ).sum() > 0, "Should have some non-zero weekly hours" + + # Verify input variables are also present + assert ( + "person_id" in person_df.columns + ), "ID variables should be preserved" + assert ( + "age" in person_df.columns + ), "Input variables like age should be preserved" + + def test_minimize_with_all_variables(self): + """Test that minimize_calibrated_dataset_legacy works with include_all_variables=True.""" + # Load CPS dataset + sim = Microsimulation( + dataset="hf://policyengine/policyengine-us-data/cps_2023.h5" + ) + sim.default_period = 2023 + + # Create a small subset for speed + household_ids = sim.calculate("household_id", 2023).values + unique_hh_ids = np.unique(household_ids)[ + :20 + ] # Just 20 households for all variables test + orig_weights = sim.calculate("household_weight", 2023).values + + subset_weights = np.zeros_like(orig_weights) + for hh_id in unique_hh_ids: + mask = household_ids == hh_id + subset_weights[mask] = orig_weights[mask] + + # Minimize with ALL variables + 
minimized_dataset = minimize_calibrated_dataset_legacy( + Microsimulation, + sim, + 2023, + pd.Series(subset_weights), + include_all_variables=True, # Include everything + important_variables=None, + ) + + # Should have many more variables + person_df = minimized_dataset.entities["person"] + assert ( + len(person_df.columns) > 100 + ), "Should have many variables when include_all=True" + + # Key calculated variables should still be there and non-zero + assert "employment_income" in person_df.columns + assert ( + person_df["employment_income"].sum() > 0 or len(person_df) == 0 + ) # Allow for empty subset + + def test_calculated_variables_consistency(self): + """Test that calculated variable sums are non-zero after minimization.""" + # Load CPS dataset + sim = Microsimulation( + dataset="hf://policyengine/policyengine-us-data/cps_2023.h5" + ) + sim.default_period = 2023 + + # Get original calculated values + orig_employment_income = sim.calculate( + "employment_income", 2023 + ).values + orig_self_employment_income = sim.calculate( + "self_employment_income", 2023 + ).values + orig_weekly_hours = sim.calculate("weekly_hours_worked", 2023).values + + # Verify we have data diversity to start with + assert ( + orig_employment_income.sum() > 0 + ), "Original employment income should have non-zero sum" + assert ( + orig_self_employment_income.sum() > 0 + ), "Original self-employment income should have non-zero sum" + assert ( + orig_weekly_hours.sum() > 0 + ), "Original weekly hours should have non-zero sum" + + # Create a subset with some households + household_ids = sim.calculate("household_id", 2023).values + unique_hh_ids = np.unique(household_ids)[ + :100 + ] # Take first 100 households + orig_weights = sim.calculate("household_weight", 2023).values + + # Create subset weights + subset_weights = np.zeros_like(orig_weights) + for hh_id in unique_hh_ids: + mask = household_ids == hh_id + subset_weights[mask] = orig_weights[mask] + + # Identify calculated variables + calculated_vars = identify_calculated_variables( + "hf://policyengine/policyengine-us-data/cps_2023.h5", + Microsimulation, + ) + + # Minimize the dataset + minimized_dataset = minimize_calibrated_dataset_legacy( + Microsimulation, + sim, + 2023, + pd.Series(subset_weights), + include_all_variables=False, + important_variables=calculated_vars, + ) + + # Check that calculated variables have non-zero sums (data diversity) + person_df = minimized_dataset.entities["person"] + + assert ( + "employment_income" in person_df.columns + ), "employment_income should be preserved" + assert ( + "self_employment_income" in person_df.columns + ), "self_employment_income should be preserved" + assert ( + "weekly_hours_worked" in person_df.columns + ), "weekly_hours_worked should be preserved" + + # Check for data diversity - sums should not be zero + min_emp_income_sum = person_df["employment_income"].sum() + min_self_emp_sum = person_df["self_employment_income"].sum() + min_hours_sum = person_df["weekly_hours_worked"].sum() + + assert ( + min_emp_income_sum > 0 + ), "Minimized employment income sum should not be zero - ensuring data diversity" + assert ( + min_self_emp_sum > 0 + ), "Minimized self-employment income sum should not be zero - ensuring data diversity" + assert ( + min_hours_sum > 0 + ), "Minimized weekly hours sum should not be zero - ensuring data diversity" From f121ac3974a63c21c4193442e86026e594d3dd0c Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Wed, 20 Aug 2025 22:17:48 +0200 Subject: [PATCH 2/6] tests are passing locally not 
sure why --- docs/calibration.ipynb | 14 +++++++------- src/policyengine_data/calibration/calibrate.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/calibration.ipynb b/docs/calibration.ipynb index 47a50f7..adde17c 100644 --- a/docs/calibration.ipynb +++ b/docs/calibration.ipynb @@ -59,7 +59,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "5a58bd2b", "metadata": {}, "outputs": [ @@ -92,12 +92,12 @@ " db_uri=db_uri, update_database=True\n", ")\n", "\n", - "# Uprate targets for consistency across definition year (disabled until IRS SOI variables are renamed to avoid errors)\n", - "# uprating_results = uprate_calibration_targets(\n", - "# system=system, db_uri=db_uri, \n", - "# from_period=2022, to_period=2023, \n", - "# update_database=True\n", - "# )" + "# Uprate targets for consistency across definition year\n", + "uprating_results = uprate_calibration_targets(\n", + " system=system, db_uri=db_uri, \n", + " from_period=2022, to_period=2023, \n", + " update_database=True\n", + ")" ] }, { diff --git a/src/policyengine_data/calibration/calibrate.py b/src/policyengine_data/calibration/calibrate.py index 6ece759..f4fd9c8 100644 --- a/src/policyengine_data/calibration/calibrate.py +++ b/src/policyengine_data/calibration/calibrate.py @@ -165,7 +165,7 @@ def calibrate_single_geography_level( ), sparse_learning_rate=0.1, regularize_with_l0=regularize_with_l0, - csv_path=f"{area}_calibration.csv", + csv_path=calibration_log_path, ) performance_log = calibrator.calibrate() optimized_sparse_weights = calibrator.sparse_weights From e0768b1ed16971e6e7c09aa57fe905a5071c912e Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Thu, 21 Aug 2025 09:55:56 +0200 Subject: [PATCH 3/6] try freezing -us version --- docs/calibration.ipynb | 6 ++++-- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/calibration.ipynb b/docs/calibration.ipynb index adde17c..88571cb 100644 --- a/docs/calibration.ipynb +++ b/docs/calibration.ipynb @@ -118,7 +118,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "c75954d1", "metadata": {}, "outputs": [ @@ -197,7 +197,8 @@ " dataset_subsample_size=10000, # Small sample for faster execution\n", " use_dataset_weights=False, # Start with equal weights\n", " regularize_with_l0=True, # Enable sparsity\n", - " noise_level=10.0\n", + " noise_level=10.0,\n", + " raise_error=False,\n", ")\n", "\n", "# Examine the results\n", @@ -226,6 +227,7 @@ " noise_level=0.0, # Minimal noise to preserve state calibration\n", " use_dataset_weights=True, # Start from state-calibrated weights\n", " regularize_with_l0=False # No sparsity at national level\n", + " raise_error=False,\n", ")\n", "\n", "# Compare results\n", diff --git a/pyproject.toml b/pyproject.toml index 1a1452e..c70e1bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ dev = [ "build", "linecheck", "yaml-changelog>=0.1.7", - "policyengine-us>=1.366.0", + "policyengine-us==1.370.1", ] docs = [ From 0692cb850475df8cac435b38e5bb1cbf9ff0cd32 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Thu, 21 Aug 2025 10:09:49 +0200 Subject: [PATCH 4/6] attempting moving to 3.13 --- .github/workflows/main.yml | 2 +- .github/workflows/pr.yaml | 6 +++--- .github/workflows/versioning.yaml | 2 +- docs/calibration.ipynb | 2 +- pyproject.toml | 8 ++++---- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 56aed00..32b8fcf 100644 --- 
a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -9,7 +9,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.11"] + python-version: ["3.13"] steps: - name: Checkout repo diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 7d0023a..9fc7e29 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -11,7 +11,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: "3.11" + python-version: "3.13" - name: Install uv uses: astral-sh/setup-uv@v5 - name: Install relevant dependencies @@ -24,7 +24,7 @@ jobs: strategy: matrix: os: [ ubuntu-latest ] - python-version: ["3.11"] + python-version: ["3.13"] fail-fast: false runs-on: ${{ matrix.os }} steps: @@ -58,7 +58,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: "3.11" + python-version: "3.13" - name: Install dependencies run: | uv pip install -e ".[dev,docs]" --system diff --git a/.github/workflows/versioning.yaml b/.github/workflows/versioning.yaml index c16790a..89647c3 100644 --- a/.github/workflows/versioning.yaml +++ b/.github/workflows/versioning.yaml @@ -25,7 +25,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v5 with: - python-version: 3.11 + python-version: 3.13 - name: Build changelog run: pip install yaml-changelog && make changelog - name: Preview changelog update diff --git a/docs/calibration.ipynb b/docs/calibration.ipynb index 88571cb..371b8fb 100644 --- a/docs/calibration.ipynb +++ b/docs/calibration.ipynb @@ -226,7 +226,7 @@ " stack_datasets=False, # Don't stack since we're using pre-stacked data\n", " noise_level=0.0, # Minimal noise to preserve state calibration\n", " use_dataset_weights=True, # Start from state-calibrated weights\n", - " regularize_with_l0=False # No sparsity at national level\n", + " regularize_with_l0=False, # No sparsity at national level\n", " raise_error=False,\n", ")\n", "\n", diff --git a/pyproject.toml b/pyproject.toml index c70e1bf..e686469 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,14 +6,14 @@ readme = "README.md" authors = [ {name = "PolicyEngine", email = "hello@policyengine.org"}, ] -requires-python = ">=3.11" +requires-python = ">=3.13" dependencies = [ "h5py", "numpy", "pandas", "huggingface_hub>=0.25.1", - "tables", - "policyengine-core>=3.6.4", + "tables>=3.10.2", + "policyengine-core>=3.20.0", "policyengine-us", # remove as soon as we fix UCGID "microdf-python", "microcalibrate", @@ -69,7 +69,7 @@ line_length = 79 [tool.black] line-length = 79 -target-version = ["py311"] +target-version = ["py313"] [project.scripts] policyengine-data = "policyengine_data:main" From 6c14614ecc37c462ed3d56934f8e02d5fa0c5aad Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Thu, 21 Aug 2025 10:51:27 +0200 Subject: [PATCH 5/6] reducing epochs for faster testing --- .../calibration/calibrate.py | 8 +++- tests/test_calibration/test_calibration.py | 41 ++++++++++--------- 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/src/policyengine_data/calibration/calibrate.py b/src/policyengine_data/calibration/calibrate.py index f4fd9c8..6b57deb 100644 --- a/src/policyengine_data/calibration/calibrate.py +++ b/src/policyengine_data/calibration/calibrate.py @@ -46,6 +46,7 @@ def calibrate_single_geography_level( year: Optional[int] = 2023, db_uri: Optional[str] = None, noise_level: Optional[float] = 10.0, + epochs: Optional[int] = 600, use_dataset_weights: Optional[bool] = True, regularize_with_l0: Optional[bool] = False, calibration_log_path: 
Optional[str] = None, @@ -72,6 +73,7 @@ def calibrate_single_geography_level( geo_sim_filter_variable (str): The variable used to filter the simulation by geography. Default in the US: "ucgid". db_uri (Optional[str]): The URI of the database to use for rescaling targets. If None, it will download the database from the default URI. noise_level (Optional[float]): The level of noise to apply during calibration. Default: 10.0. + epochs (Optional[int]): The number of training epochs to use for calibration. Default: 600. use_dataset_weights (Optional[bool]): Whether to use original dataset weights as the starting weights for calibration. Default: True. regularize_with_l0 (Optional[bool]): Whether to use L0 regularization during calibration. Default: False. calibration_log_path (Optional[str]): The path to the calibration log file. If None, calibration log CSVs will not be saved. @@ -157,7 +159,7 @@ def calibrate_single_geography_level( targets=targets, target_names=target_names, estimate_matrix=metrics_matrix, - epochs=600, + epochs=epochs, learning_rate=0.2, noise_level=noise_level, excluded_targets=( @@ -260,6 +262,7 @@ def calibrate_all_levels( year: Optional[int] = 2023, db_uri: Optional[str] = None, noise_level: Optional[float] = 10.0, + epochs: Optional[int] = 600, regularize_with_l0: Optional[bool] = False, raise_error: Optional[bool] = True, ) -> "SingleYearDataset": @@ -283,6 +286,7 @@ def calibrate_all_levels( year (Optional[int]): The year to use for calibration. Default: 2023. db_uri (Optional[str]): The database URI to use for calibration. If None, it will download the database from the default URI. noise_level (Optional[float]): The noise level to use for calibration. Default: 10.0. + epochs (Optional[int]): The number of training epochs to use for calibration. Default: 600. regularize_with_l0 (Optional[bool]): Whether to use L0 regularization for calibration. Default: False. raise_error (Optional[bool]): Whether to raise an error if matrix creation fails. Default: True. @@ -429,7 +433,7 @@ def calibrate_all_levels( targets=targets, target_names=target_names, estimate_matrix=metrics_matrix, - epochs=600, + epochs=epochs, learning_rate=0.2, noise_level=noise_level, excluded_targets=( diff --git a/tests/test_calibration/test_calibration.py b/tests/test_calibration/test_calibration.py index 1fecc92..0330039 100644 --- a/tests/test_calibration/test_calibration.py +++ b/tests/test_calibration/test_calibration.py @@ -2,8 +2,6 @@ Test the calibration logic for different geographic levels that integrates all other calibration pipeline components. """ -import pytest - areas_in_national_level = { "United States": "0100000US", } @@ -70,6 +68,7 @@ def test_calibration_per_geographic_level_iteration(): Conversion between dataset class types is necessary until full migration to the new SingleYearDataset class in the policyengine_core repository. 
""" from policyengine_us import Microsimulation + from policyengine_us.system import system from policyengine_data.tools.legacy_class_conversions import ( SingleYearDataset_to_Dataset, ) @@ -91,14 +90,14 @@ def test_calibration_per_geographic_level_iteration(): db_uri=db_uri, update_database=True ) - # Uprate targets for consistency across definition year (disabled until IRS SOI variables are renamed to avoid errors) - # uprating_results = uprate_calibration_targets( - # system=system, - # db_uri=db_uri, - # from_period=2022, - # to_period=2023, - # update_database=True, - # ) + # Uprate targets for consistency across definition year + uprating_results = uprate_calibration_targets( + system=system, + db_uri=db_uri, + from_period=2022, + to_period=2023, + update_database=True, + ) # Calibrate the state level dataset with sparsity state_level_calibrated_dataset = calibrate_single_geography_level( @@ -106,6 +105,7 @@ def test_calibration_per_geographic_level_iteration(): areas_in_state_level, "hf://policyengine/policyengine-us-data/cps_2023.h5", dataset_subsample_size=1000, # approximately 5% of the base dataset to decrease computation costs + epochs=300, use_dataset_weights=False, regularize_with_l0=True, ) @@ -125,6 +125,7 @@ def test_calibration_per_geographic_level_iteration(): dataset="Dataset_state_level.h5", stack_datasets=False, noise_level=0.0, + epochs=300, use_dataset_weights=True, # use the previously calibrated weights regularize_with_l0=False, ) @@ -144,7 +145,7 @@ def test_calibration_per_geographic_level_iteration(): assert ( state_level_weights - national_level_weights - ).sum() > 0, "Household weights do not differ between state and national levels, suggesting national calibration was unsucessful." + ).sum() != 0, "Household weights do not differ between state and national levels, suggesting national calibration was unsucessful." def test_calibration_combining_all_levels_at_once(): @@ -154,6 +155,7 @@ def test_calibration_combining_all_levels_at_once(): Conversion between dataset class types is necessary until full migration to the new SingleYearDataset class in the policyengine_core repository. 
""" from policyengine_us import Microsimulation + from policyengine_us.system import system from policyengine_data.tools.legacy_class_conversions import ( SingleYearDataset_to_Dataset, ) @@ -175,14 +177,14 @@ def test_calibration_combining_all_levels_at_once(): db_uri=db_uri, update_database=True ) - # Uprate targets for consistency across definition year (disabled until IRS SOI variables are renamed to avoid errors) - # uprating_results = uprate_calibration_targets( - # system=system, - # db_uri=db_uri, - # from_period=2022, - # to_period=2023, - # update_database=True, - # ) + # Uprate targets for consistency across definition year + uprating_results = uprate_calibration_targets( + system=system, + db_uri=db_uri, + from_period=2022, + to_period=2023, + update_database=True, + ) # Calibrate the full dataset at once (only passing the identifyers of the areas for which the base dataset will be stacked) fully_calibrated_dataset = calibrate_all_levels( @@ -191,6 +193,7 @@ def test_calibration_combining_all_levels_at_once(): "hf://policyengine/policyengine-us-data/cps_2023.h5", geo_hierarchy=["0100000US", "0400000US"], dataset_subsample_size=1000, + epochs=300, regularize_with_l0=True, raise_error=False, # this will avoid raising an error if some targets have no records contributing to them (given sampling) ) From 55b3fdc911b913a8c43264a5c6997c6f08dc7c93 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Fri, 22 Aug 2025 09:56:53 +0200 Subject: [PATCH 6/6] add documentation --- docs/dataset.ipynb | 17 +++++++++++++++-- .../calibration/dataset_duplication.py | 2 +- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/docs/dataset.ipynb b/docs/dataset.ipynb index 65670f0..f4b49f6 100644 --- a/docs/dataset.ipynb +++ b/docs/dataset.ipynb @@ -169,9 +169,17 @@ "#### Method 3: From a PolicyEngine MicroSimulation" ] }, + { + "cell_type": "markdown", + "id": "f7545881", + "metadata": {}, + "source": [ + "Note that, by default, the `from_simulation()` method only loads input variables, which come from the underlying dataset. This may create problems later on if when trying to calculate specific variables using the loaded SingleYearDataset without running the whole microsimulation. The `include_all_variables` parameter, when set to `True` will calculate all variables in the microsimulation object when loading the SingleYearDataset to avoid problems down the line. However, this is quite computationally heavy. Instead, the parameter `additional_variables` enables passing a dictionary of specific \"calculated\" variables to be loaded, avoiding computing variables that will not be needed." 
+ ] + }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "4f1a90f6", "metadata": {}, "outputs": [ @@ -192,7 +200,12 @@ "\n", "sim = Microsimulation(dataset=dataset)\n", "\n", - "single_year_dataset = SingleYearDataset.from_simulation(sim, time_period=start_year)\n", + "single_year_dataset = SingleYearDataset.from_simulation(\n", + " sim, \n", + " time_period=start_year,\n", + " include_all_variables=False,\n", + " additional_variables={\"person\": [\"employment_income\"]},\n", + ")\n", "single_year_dataset.time_period = start_year\n", "\n", "print(f\"Dataset created from PolicyEngine US microdata stored in {dataset}\")\n", diff --git a/src/policyengine_data/calibration/dataset_duplication.py b/src/policyengine_data/calibration/dataset_duplication.py index 9a2846e..52a4f3e 100644 --- a/src/policyengine_data/calibration/dataset_duplication.py +++ b/src/policyengine_data/calibration/dataset_duplication.py @@ -19,7 +19,7 @@ def identify_calculated_variables( microsimulation_class, ) -> dict: """ - Identify calculated variables in a dataset by comparing with input variables. + Identify calculated variables in a dataset by comparing with input variables. "Input" variables come from the underlying dataset. "Calculated" variables come from simulation formulas in one of the country-specific packages. Args: dataset_path: Path to the dataset file (e.g., "cps_2023.h5")
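
Usage sketch (illustrative, not part of any hunk above): the imports, function names, and parameters below are the ones introduced or touched by this patch series, while the DATASET constant and the final print are placeholders.

# Identify calculated variables stored in an .h5 dataset and carry them into a
# SingleYearDataset instead of dropping them (sketch; the path is illustrative).
from policyengine_us import Microsimulation

from policyengine_data import SingleYearDataset
from policyengine_data.calibration.dataset_duplication import (
    identify_calculated_variables,
)

DATASET = "hf://policyengine/policyengine-us-data/cps_2023.h5"

# Variables present in the file but not among the simulation's input variables,
# grouped by entity, e.g. {"person": ["employment_income", ...]}.
calculated_vars = identify_calculated_variables(DATASET, Microsimulation)

# Rebuild the dataset from a simulation, keeping input variables plus only the
# identified calculated ones (cheaper than include_all_variables=True).
sim = Microsimulation(dataset=DATASET)
dataset_2023 = SingleYearDataset.from_simulation(
    sim,
    time_period=2023,
    include_all_variables=False,
    additional_variables=calculated_vars,
)

print(sorted(dataset_2023.entities["person"].columns))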