diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29..34ee3c8 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: minor + changes: + added: + - Key normalisation functionality. diff --git a/docs/normalise_keys.md b/docs/normalise_keys.md new file mode 100644 index 0000000..1f324cc --- /dev/null +++ b/docs/normalise_keys.md @@ -0,0 +1,103 @@ +# Key normalisation + +The `normalise_keys` module provides utilities for normalising primary and foreign keys in related database tables to zero-based sequential indices while preserving relationships. + +## Functions + +### `normalise_table_keys(tables, primary_keys, foreign_keys=None)` + +Normalises primary and foreign keys across multiple related tables. + +**Parameters:** +- `tables` (Dict[str, pd.DataFrame]): Dictionary mapping table names to DataFrames +- `primary_keys` (Dict[str, str]): Dictionary mapping table names to their primary key column names +- `foreign_keys` (Optional[Dict[str, Dict[str, str]]]): Dictionary mapping table names to their foreign key relationships. Format: `{table_name: {fk_column: referenced_table}}`. If None, foreign keys are auto-detected. 
+ +**Returns:** +- Dict[str, pd.DataFrame]: Dictionary of normalised tables with zero-based integer keys + +**Example:** +```python +import pandas as pd +from policyengine_data import normalise_table_keys + +users = pd.DataFrame({ + 'user_id': [101, 105, 103], + 'name': ['Alice', 'Bob', 'Carol'] +}) + +orders = pd.DataFrame({ + 'order_id': [201, 205, 207], + 'user_id': [105, 101, 105], + 'amount': [25.99, 15.50, 42.00] +}) + +tables = {'users': users, 'orders': orders} +primary_keys = {'users': 'user_id', 'orders': 'order_id'} + +# Auto-detect foreign keys +normalised = normalise_table_keys(tables, primary_keys) + +# Or specify foreign keys explicitly +foreign_keys = {'orders': {'user_id': 'users'}} +normalised = normalise_table_keys(tables, primary_keys, foreign_keys) +``` + +After normalisation: +- User IDs become 0, 1, 2 (instead of 101, 105, 103) +- Order IDs become 0, 1, 2 (instead of 201, 205, 207) +- Foreign key relationships are preserved (Bob's orders still reference Bob's new ID) + +### `normalise_single_table_keys(df, key_column, start_index=0)` + +Normalises keys in a single table to sequential indices. 
+ +**Parameters:** +- `df` (pd.DataFrame): DataFrame to normalise +- `key_column` (str): Name of the key column to normalise +- `start_index` (int): Starting index for normalisation (default: 0) + +**Returns:** +- pd.DataFrame: DataFrame with normalised keys + +**Example:** +```python +import pandas as pd +from policyengine_data import normalise_single_table_keys + +df = pd.DataFrame({ + 'id': [101, 105, 103], + 'value': ['A', 'B', 'C'] +}) + +normalised = normalise_single_table_keys(df, 'id') +# Result: IDs become 0, 1, 2 +``` + +## Key features + +- **Relationship preservation**: All foreign key relationships between tables are maintained after normalisation +- **Auto-detection**: Foreign keys can be automatically detected based on column name matching +- **Zero-based indexing**: Keys are normalised to start from 0 and increment sequentially +- **Flexible input**: Works with any pandas DataFrames and column names +- **Error handling**: Clear error messages for missing columns or invalid references +- **Duplicate handling**: Properly handles duplicate keys within tables + +## Use cases + +This functionality is particularly useful for: + +- Preparing data for machine learning models that expect sequential indices +- Converting legacy database exports with non-sequential primary keys +- Standardising key formats across multiple related datasets +- Reducing memory usage by converting large integer keys to compact sequential indices +- Creating consistent test datasets with predictable key patterns + +## Implementation notes + +The normalisation process works in two phases: + +1. **Mapping creation**: Unique values in each primary key column are mapped to zero-based sequential integers +2. **Application**: These mappings are applied to both primary keys and corresponding foreign keys across all tables + +Foreign key auto-detection works by identifying columns that share names with primary key columns from other tables. 
For more complex relationships, explicit foreign key specification is recommended. \ No newline at end of file diff --git a/src/policyengine_data/__init__.py b/src/policyengine_data/__init__.py index 9a4f8a7..58b2423 100644 --- a/src/policyengine_data/__init__.py +++ b/src/policyengine_data/__init__.py @@ -1,3 +1,4 @@ from .dataset_legacy import Dataset from .multi_year_dataset import MultiYearDataset +from .normalise_keys import normalise_single_table_keys, normalise_table_keys from .single_year_dataset import SingleYearDataset diff --git a/src/policyengine_data/normalise_keys.py b/src/policyengine_data/normalise_keys.py new file mode 100644 index 0000000..49189b4 --- /dev/null +++ b/src/policyengine_data/normalise_keys.py @@ -0,0 +1,187 @@ +""" +Key normalisation utilities for tables with primary and foreign keys. + +This module provides functionality to normalise primary and foreign keys +in related tables to zero-based sequential indices while preserving +relationships between tables. +""" + +from typing import Any, Dict, List, Optional, Union + +import pandas as pd + + +def normalise_table_keys( + tables: Dict[str, pd.DataFrame], + primary_keys: Dict[str, str], + foreign_keys: Optional[Dict[str, Dict[str, str]]] = None, + start_index: Optional[int] = 0, +) -> Dict[str, pd.DataFrame]: + """ + Normalise primary and foreign keys across multiple tables to zero-based indices. + + This function takes a collection of related tables and converts their primary + and foreign keys to `start_index`-based sequential integers while preserving all + relationships between tables. + + Args: + tables: Dictionary mapping table names to DataFrames + primary_keys: Dictionary mapping table names to their primary key column names + foreign_keys: Optional dictionary mapping table names to their foreign key + relationships. 
Format: {table_name: {fk_column: referenced_table}} + If None, foreign keys will be auto-detected based on column names + matching primary key names from other tables. + start_index: Starting index for normalisation (default: 0). + + Returns: + Dictionary of normalised tables with `start_index`-based integer keys + + Example: + >>> users = pd.DataFrame({ + ... 'user_id': [101, 105, 103], + ... 'name': ['Alice', 'Bob', 'Carol'] + ... }) + >>> orders = pd.DataFrame({ + ... 'order_id': [201, 205, 207], + ... 'user_id': [105, 101, 105], + ... 'amount': [25.99, 15.50, 42.00] + ... }) + >>> tables = {'users': users, 'orders': orders} + >>> primary_keys = {'users': 'user_id', 'orders': 'order_id'} + >>> foreign_keys = {'orders': {'user_id': 'users'}} + >>> normalised = normalise_table_keys(tables, primary_keys, foreign_keys) + >>> # Result: user_ids become 0,1,2 and order_ids become 0,1,2 + >>> # with foreign key relationships preserved + """ + if not tables: + return {} + + if foreign_keys is None: + foreign_keys = _auto_detect_foreign_keys(tables, primary_keys) + + # Create mapping dictionaries for each primary key + key_mappings = {} + normalised_tables = {} + + # First pass: create mappings for primary keys + for table_name, df in tables.items(): + if table_name not in primary_keys: + raise ValueError( + f"No primary key specified for table '{table_name}'" + ) + + pk_column = primary_keys[table_name] + if pk_column not in df.columns: + raise ValueError( + f"Primary key column '{pk_column}' not found in table '{table_name}'" + ) + + # Get unique values and create zero-based mapping + unique_keys = df[pk_column].unique() + key_mappings[table_name] = { + old_key: new_key + start_index + for new_key, old_key in enumerate(unique_keys) + } + + # Second pass: apply mappings to all tables + for table_name, df in tables.items(): + normalised_df = df.copy() + pk_column = primary_keys[table_name] + + # Map primary key + normalised_df[pk_column] = normalised_df[pk_column].map( + 
key_mappings[table_name] + ) + + # Map foreign keys + if table_name in foreign_keys: + for fk_column, referenced_table in foreign_keys[ + table_name + ].items(): + if fk_column not in df.columns: + raise ValueError( + f"Foreign key column '{fk_column}' not found in table '{table_name}'" + ) + if referenced_table not in key_mappings: + raise ValueError( + f"Referenced table '{referenced_table}' not found" + ) + + normalised_df[fk_column] = normalised_df[fk_column].map( + key_mappings[referenced_table] + ) + + normalised_tables[table_name] = normalised_df + + return normalised_tables + + +def _auto_detect_foreign_keys( + tables: Dict[str, pd.DataFrame], primary_keys: Dict[str, str] +) -> Dict[str, Dict[str, str]]: + """ + Auto-detect foreign key relationships based on column name matching. + + Args: + tables: Dictionary of table names to DataFrames + primary_keys: Dictionary of primary key column names per table + + Returns: + Dictionary of detected foreign key relationships + """ + foreign_keys = {} + pk_columns = set(primary_keys.values()) + + for table_name, df in tables.items(): + table_fks = {} + pk_column = primary_keys[table_name] + + # Look for columns that match primary keys from other tables + for column in df.columns: + if column != pk_column and column in pk_columns: + # Find which table this primary key belongs to + for ref_table, ref_pk in primary_keys.items(): + if ref_pk == column and ref_table != table_name: + table_fks[column] = ref_table + break + + if table_fks: + foreign_keys[table_name] = table_fks + + return foreign_keys + + +def normalise_single_table_keys( + df: pd.DataFrame, key_column: str, start_index: int = 0 +) -> pd.DataFrame: + """ + Normalise keys in a single table to sequential indices. + + Args: + df: DataFrame to normalise + key_column: Name of the key column to normalise + start_index: Starting index for normalisation (default: 0) + + Returns: + DataFrame with normalised keys + + Example: + >>> df = pd.DataFrame({ + ... 
"""
Tests for key normalisation functionality.
"""

import pandas as pd
import pytest

from policyengine_data.normalise_keys import (
    _auto_detect_foreign_keys,
    normalise_single_table_keys,
    normalise_table_keys,
)


class TestNormaliseTableKeys:
    """Test cases for normalise_table_keys function."""

    def test_simple_single_table(self):
        """Test normalisation of a single table with no foreign keys."""
        people = pd.DataFrame(
            {"user_id": [101, 105, 103], "name": ["Alice", "Bob", "Carol"]}
        )

        out = normalise_table_keys({"users": people}, {"users": "user_id"})

        assert set(out.keys()) == {"users"}
        assert out["users"]["user_id"].tolist() == [0, 1, 2]
        assert out["users"]["name"].tolist() == ["Alice", "Bob", "Carol"]

    def test_custom_start_index(self):
        """Test normalisation with custom start index."""
        people = pd.DataFrame(
            {"user_id": [101, 105, 103], "name": ["Alice", "Bob", "Carol"]}
        )

        out = normalise_table_keys(
            {"users": people}, {"users": "user_id"}, start_index=10
        )

        assert set(out.keys()) == {"users"}
        assert out["users"]["user_id"].tolist() == [10, 11, 12]
        assert out["users"]["name"].tolist() == ["Alice", "Bob", "Carol"]

    def test_two_tables_with_foreign_keys(self):
        """Test normalisation with explicit foreign key relationships."""
        people = pd.DataFrame(
            {"user_id": [101, 105, 103], "name": ["Alice", "Bob", "Carol"]}
        )
        purchases = pd.DataFrame(
            {
                "order_id": [201, 205, 207],
                "user_id": [105, 101, 105],
                "amount": [25.99, 15.50, 42.00],
            }
        )

        out = normalise_table_keys(
            {"users": people, "orders": purchases},
            {"users": "user_id", "orders": "order_id"},
            {"orders": {"user_id": "users"}},
        )

        users_out, orders_out = out["users"], out["orders"]
        assert set(users_out["user_id"]) == {0, 1, 2}
        assert set(orders_out["order_id"]) == {0, 1, 2}

        # User 105 (Bob) originally owned orders 201 and 207; both of his
        # orders must still point at his renumbered id.
        bob_id = users_out.loc[users_out["name"] == "Bob", "user_id"].iloc[0]
        bob_rows = orders_out[orders_out["user_id"] == bob_id]
        assert len(bob_rows) == 2
        assert set(bob_rows["amount"]) == {25.99, 42.00}

    def test_auto_detect_foreign_keys(self):
        """Test automatic detection of foreign key relationships."""
        people = pd.DataFrame(
            {"user_id": [101, 105, 103], "name": ["Alice", "Bob", "Carol"]}
        )
        purchases = pd.DataFrame(
            {
                "order_id": [201, 205, 207],
                "user_id": [105, 101, 105],
                "amount": [25.99, 15.50, 42.00],
            }
        )

        # No explicit foreign keys: detection should link orders.user_id.
        out = normalise_table_keys(
            {"users": people, "orders": purchases},
            {"users": "user_id", "orders": "order_id"},
        )

        users_out, orders_out = out["users"], out["orders"]

        # Bob should still have his two orders after renumbering.
        bob_id = users_out.loc[users_out["name"] == "Bob", "user_id"].iloc[0]
        assert len(orders_out[orders_out["user_id"] == bob_id]) == 2

    def test_multiple_foreign_keys(self):
        """Test table with multiple foreign key relationships."""
        people = pd.DataFrame(
            {"user_id": [1, 2, 3], "name": ["Alice", "Bob", "Carol"]}
        )
        sections = pd.DataFrame(
            {
                "category_id": [10, 20, 30],
                "category_name": ["Electronics", "Books", "Clothing"],
            }
        )
        purchases = pd.DataFrame(
            {
                "order_id": [100, 200, 300],
                "user_id": [2, 1, 2],
                "category_id": [20, 10, 30],
                "amount": [25.99, 15.50, 42.00],
            }
        )

        pks = {
            "users": "user_id",
            "categories": "category_id",
            "orders": "order_id",
        }
        out = normalise_table_keys(
            {"users": people, "categories": sections, "orders": purchases},
            pks,
        )

        # Every table should now use zero-based keys.
        for name, frame in out.items():
            assert set(frame[pks[name]]) == {0, 1, 2}

        # Bob (original user_id=2) should still own two orders.
        users_out, orders_out = out["users"], out["orders"]
        bob_id = users_out.loc[users_out["name"] == "Bob", "user_id"].iloc[0]
        assert len(orders_out[orders_out["user_id"] == bob_id]) == 2

    def test_empty_tables(self):
        """Test with empty input."""
        assert normalise_table_keys({}, {}) == {}

    def test_missing_primary_key_column(self):
        """Test error handling for missing primary key column."""
        frame = pd.DataFrame({"name": ["Alice", "Bob"]})

        with pytest.raises(
            ValueError, match="Primary key column 'missing_id' not found"
        ):
            normalise_table_keys({"users": frame}, {"users": "missing_id"})

    def test_missing_foreign_key_column(self):
        """Test error handling for missing foreign key column."""
        people = pd.DataFrame({"user_id": [1, 2], "name": ["Alice", "Bob"]})
        purchases = pd.DataFrame(
            {"order_id": [100, 200], "amount": [25.99, 15.50]}
        )

        with pytest.raises(
            ValueError, match="Foreign key column 'missing_user_id' not found"
        ):
            normalise_table_keys(
                {"users": people, "orders": purchases},
                {"users": "user_id", "orders": "order_id"},
                {"orders": {"missing_user_id": "users"}},
            )

    def test_missing_referenced_table(self):
        """Test error handling for missing referenced table."""
        purchases = pd.DataFrame(
            {
                "order_id": [100, 200],
                "user_id": [1, 2],
                "amount": [25.99, 15.50],
            }
        )

        with pytest.raises(
            ValueError, match="Referenced table 'missing_users' not found"
        ):
            normalise_table_keys(
                {"orders": purchases},
                {"orders": "order_id"},
                {"orders": {"user_id": "missing_users"}},
            )


class TestNormaliseSingleTableKeys:
    """Test cases for normalise_single_table_keys function."""

    def test_basic_normalisation(self):
        """Test basic single table key normalisation."""
        frame = pd.DataFrame({"id": [101, 105, 103], "value": ["A", "B", "C"]})

        out = normalise_single_table_keys(frame, "id")

        assert out["id"].tolist() == [0, 1, 2]
        assert out["value"].tolist() == ["A", "B", "C"]

    def test_custom_start_index(self):
        """Test normalisation with custom start index."""
        frame = pd.DataFrame({"id": [101, 105, 103], "value": ["A", "B", "C"]})

        out = normalise_single_table_keys(frame, "id", start_index=10)

        assert out["id"].tolist() == [10, 11, 12]
        assert out["value"].tolist() == ["A", "B", "C"]

    def test_duplicate_keys_preserved(self):
        """Test that duplicate keys are handled correctly."""
        frame = pd.DataFrame(
            {"id": [101, 105, 101, 103], "value": ["A", "B", "A2", "C"]}
        )

        out = normalise_single_table_keys(frame, "id")

        # Three distinct originals -> exactly the normalised keys 0, 1, 2.
        assert len(out["id"].unique()) == 3
        assert set(out["id"].unique()) == {0, 1, 2}

        # Rows sharing an original key must share the normalised key.
        dup_rows = out.loc[frame.index[frame["id"] == 101]]
        assert dup_rows["id"].nunique() == 1

    def test_missing_key_column(self):
        """Test error handling for missing key column."""
        frame = pd.DataFrame({"value": ["A", "B", "C"]})

        with pytest.raises(
            ValueError, match="Key column 'missing_id' not found"
        ):
            normalise_single_table_keys(frame, "missing_id")


class TestAutoDetectForeignKeys:
    """Test cases for _auto_detect_foreign_keys function."""

    def test_simple_detection(self):
        """Test basic foreign key detection."""
        people = pd.DataFrame({"user_id": [1, 2], "name": ["Alice", "Bob"]})
        purchases = pd.DataFrame({"order_id": [100, 200], "user_id": [1, 2]})

        detected = _auto_detect_foreign_keys(
            {"users": people, "orders": purchases},
            {"users": "user_id", "orders": "order_id"},
        )

        assert detected == {"orders": {"user_id": "users"}}

    def test_no_foreign_keys(self):
        """Test when no foreign keys are detected."""
        people = pd.DataFrame({"user_id": [1, 2], "name": ["Alice", "Bob"]})
        stock = pd.DataFrame(
            {"product_id": [100, 200], "name": ["Widget", "Gadget"]}
        )

        detected = _auto_detect_foreign_keys(
            {"users": people, "products": stock},
            {"users": "user_id", "products": "product_id"},
        )

        assert detected == {}

    def test_multiple_foreign_keys_detection(self):
        """Test detection of multiple foreign keys in one table."""
        people = pd.DataFrame({"user_id": [1, 2], "name": ["Alice", "Bob"]})
        sections = pd.DataFrame(
            {"category_id": [10, 20], "name": ["Electronics", "Books"]}
        )
        purchases = pd.DataFrame(
            {
                "order_id": [100, 200],
                "user_id": [1, 2],
                "category_id": [10, 20],
            }
        )

        detected = _auto_detect_foreign_keys(
            {"users": people, "categories": sections, "orders": purchases},
            {
                "users": "user_id",
                "categories": "category_id",
                "orders": "order_id",
            },
        )

        assert detected == {
            "orders": {"user_id": "users", "category_id": "categories"}
        }