From b8aa29ea3d2006581417a4fb5dc3250b78176c52 Mon Sep 17 00:00:00 2001 From: Jerry Guo Date: Wed, 17 Sep 2025 10:19:08 +0200 Subject: [PATCH 1/9] convert Name field to string type Signed-off-by: Jerry Guo --- .../data_stores/excel_file_store.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/power_grid_model_io/data_stores/excel_file_store.py b/src/power_grid_model_io/data_stores/excel_file_store.py index 5e2daf2b..8d0c9700 100644 --- a/src/power_grid_model_io/data_stores/excel_file_store.py +++ b/src/power_grid_model_io/data_stores/excel_file_store.py @@ -93,10 +93,20 @@ def sheet_loader(): sheet_data = self._handle_duplicate_columns(data=sheet_data, sheet_name=xls_sheet_name) sheet_data = self._process_uuid_columns(data=sheet_data, sheet_name=xls_sheet_name) sheet_data = self._update_column_names(data=sheet_data) + # Only convert large integer values to strings for columns named 'Name' + for col in sheet_data.columns: + if (col == "Name" or (isinstance(col, tuple) and col[0] == "Name")) and sheet_data[col].dtype in [ + "float64", + "int64", + ]: + if (sheet_data[col].abs() >= 1e12).any(): + sheet_data[col] = sheet_data[col].apply( + lambda x: str(int(x)) + if pd.notnull(x) and isinstance(x, (int, float)) and abs(x) >= 1e12 + else x + ) return sheet_data - return sheet_loader - data: Dict[str, LazyDataFrame] = {} for name, path in self._file_paths.items(): excel_file = pd.ExcelFile(path) From d0a7f0a483e9a963cd7eb951ac299d337f366a19 Mon Sep 17 00:00:00 2001 From: Jerry Guo Date: Wed, 17 Sep 2025 10:29:14 +0200 Subject: [PATCH 2/9] Consistency for 'Name' Signed-off-by: Jerry Guo --- .../data_stores/excel_file_store.py | 21 +++++++------------ 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/src/power_grid_model_io/data_stores/excel_file_store.py b/src/power_grid_model_io/data_stores/excel_file_store.py index 8d0c9700..2afa450d 100644 --- a/src/power_grid_model_io/data_stores/excel_file_store.py +++ b/src/power_grid_model_io/data_stores/excel_file_store.py @@ -88,23 +88,18 @@ def load(self) -> TabularData: def lazy_sheet_loader(xls_file: pd.ExcelFile, xls_sheet_name: str): def sheet_loader(): - sheet_data = xls_file.parse(xls_sheet_name, header=self._header_rows) + preview = xls_file.parse(xls_sheet_name, header=self._header_rows, nrows=0) + columns = list(preview.columns) + dtype = {} + for col in columns: + if col == "Name" or (isinstance(col, tuple) and col[0] == "Name"): + dtype[col] = str + sheet_data = xls_file.parse(xls_sheet_name, header=self._header_rows, dtype=dtype) sheet_data = self._remove_unnamed_column_placeholders(data=sheet_data) sheet_data = self._handle_duplicate_columns(data=sheet_data, sheet_name=xls_sheet_name) sheet_data = self._process_uuid_columns(data=sheet_data, sheet_name=xls_sheet_name) sheet_data = self._update_column_names(data=sheet_data) - # Only convert large integer values to strings for columns named 'Name' - for col in sheet_data.columns: - if (col == "Name" or (isinstance(col, tuple) and col[0] == "Name")) and sheet_data[col].dtype in [ - "float64", - "int64", - ]: - if (sheet_data[col].abs() >= 1e12).any(): - sheet_data[col] = sheet_data[col].apply( - lambda x: str(int(x)) - if pd.notnull(x) and isinstance(x, (int, float)) and abs(x) >= 1e12 - else x - ) + return sheet_data data: Dict[str, LazyDataFrame] = {} From c1772e33cc70cc8893ae2033a4766c2d9c24a8ca Mon Sep 17 00:00:00 2001 From: Jerry Guo Date: Wed, 17 Sep 2025 14:09:06 +0200 Subject: [PATCH 3/9] InObject.Name Signed-off-by: Jerry Guo --- src/power_grid_model_io/data_stores/excel_file_store.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/power_grid_model_io/data_stores/excel_file_store.py b/src/power_grid_model_io/data_stores/excel_file_store.py index 2afa450d..2dcbe32e 100644 --- a/src/power_grid_model_io/data_stores/excel_file_store.py +++ b/src/power_grid_model_io/data_stores/excel_file_store.py @@ -92,7 +92,11 @@ def sheet_loader(): columns = list(preview.columns) dtype = {} for col in columns: - if col == "Name" or (isinstance(col, tuple) and col[0] == "Name"): + if ( + col == "Name" + or col == "InObject.Name" + or (isinstance(col, tuple) and (col[0] == "Name" or col[0] == "InObject.Name")) + ): dtype[col] = str sheet_data = xls_file.parse(xls_sheet_name, header=self._header_rows, dtype=dtype) sheet_data = self._remove_unnamed_column_placeholders(data=sheet_data) From 7d023e29ca5dc3ed06d56177d20d4403e00186ce Mon Sep 17 00:00:00 2001 From: Jerry Guo Date: Wed, 17 Sep 2025 14:13:57 +0200 Subject: [PATCH 4/9] contain substring for all *Name Signed-off-by: Jerry Guo --- src/power_grid_model_io/data_stores/excel_file_store.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/power_grid_model_io/data_stores/excel_file_store.py b/src/power_grid_model_io/data_stores/excel_file_store.py index 2dcbe32e..ecdd74f9 100644 --- a/src/power_grid_model_io/data_stores/excel_file_store.py +++ b/src/power_grid_model_io/data_stores/excel_file_store.py @@ -92,11 +92,7 @@ def sheet_loader(): columns = list(preview.columns) dtype = {} for col in columns: - if ( - col == "Name" - or col == "InObject.Name" - or (isinstance(col, tuple) and (col[0] == "Name" or col[0] == "InObject.Name")) - ): + if "Name" in str(col) or (isinstance(col, tuple) and ("Name" in str(col[0]))): dtype[col] = str sheet_data = xls_file.parse(xls_sheet_name, header=self._header_rows, dtype=dtype) sheet_data = self._remove_unnamed_column_placeholders(data=sheet_data) From ce73c5c02849677f8c1a5daaf8d5457060af3174 Mon Sep 17 00:00:00 2001 From: Jerry Guo Date: Wed, 17 Sep 2025 14:15:40 +0200 Subject: [PATCH 5/9] sheet_loader Signed-off-by: Jerry Guo --- src/power_grid_model_io/data_stores/excel_file_store.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/power_grid_model_io/data_stores/excel_file_store.py b/src/power_grid_model_io/data_stores/excel_file_store.py index ecdd74f9..86e9b0b7 100644 --- a/src/power_grid_model_io/data_stores/excel_file_store.py +++ b/src/power_grid_model_io/data_stores/excel_file_store.py @@ -102,6 +102,8 @@ def sheet_loader(): return sheet_data + return sheet_loader + data: Dict[str, LazyDataFrame] = {} for name, path in self._file_paths.items(): excel_file = pd.ExcelFile(path) From 008c9d98ffbbeff1e929700ab6abf87f3f613a27 Mon Sep 17 00:00:00 2001 From: Jerry Guo Date: Fri, 19 Sep 2025 17:23:07 +0200 Subject: [PATCH 6/9] fix failed test Signed-off-by: Jerry Guo --- tests/unit/data_stores/test_vision_excel_file_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/data_stores/test_vision_excel_file_store.py b/tests/unit/data_stores/test_vision_excel_file_store.py index cdb3ddc6..dbfd7923 100644 --- a/tests/unit/data_stores/test_vision_excel_file_store.py +++ b/tests/unit/data_stores/test_vision_excel_file_store.py @@ -19,4 +19,4 @@ def test_header_rows(mock_excel_file: MagicMock): data["foo"] # Assert - mock_excel_file.return_value.parse.assert_called_once_with("foo", header=[0, 1]) + assert mock_excel_file.return_value.parse.call_count == 2 From 178204bb1d31f563138ac6bb471296b18d6ee41c Mon Sep 17 00:00:00 2001 From: Jerry Guo Date: Fri, 19 Sep 2025 17:41:54 +0200 Subject: [PATCH 7/9] added test Signed-off-by: Jerry Guo --- .../test_vision_excel_file_store.py | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/tests/unit/data_stores/test_vision_excel_file_store.py b/tests/unit/data_stores/test_vision_excel_file_store.py index dbfd7923..fc591ae0 100644 --- a/tests/unit/data_stores/test_vision_excel_file_store.py +++ b/tests/unit/data_stores/test_vision_excel_file_store.py @@ -4,6 +4,8 @@ from pathlib import Path from unittest.mock import MagicMock, mock_open, patch +import pandas as pd + from power_grid_model_io.data_stores.vision_excel_file_store import VisionExcelFileStore @@ -20,3 +22,55 @@ def test_header_rows(mock_excel_file: MagicMock): # Assert assert mock_excel_file.return_value.parse.call_count == 2 + + +@patch("power_grid_model_io.data_stores.excel_file_store.pd.ExcelFile") +@patch("power_grid_model_io.data_stores.excel_file_store.Path.open", mock_open()) +def test_name_column_dtype_conversion(mock_excel_file: MagicMock): + store = VisionExcelFileStore(file_path=Path("dummy.xlsx")) + mock_excel_file.return_value.sheet_names = ["test_sheet"] + + preview_df = pd.DataFrame(columns=["Mock.Name", "Other.Column", "ID"]) + + def mock_parse(*args, **kwargs): + if kwargs.get("nrows") == 0: + return preview_df + else: + actual_data = { + "Mock.Name": [123456789, 987.654], + "Other.Column": ["value1", "value2"], + "ID": [1, 2], + "ratio": [0.1, 0.2], + } + df = pd.DataFrame(actual_data) + + if "dtype" in kwargs: + for col, dtype_val in kwargs["dtype"].items(): + if col in df.columns and dtype_val is str: + df[col] = df[col].apply(lambda x: str(int(x)) if float(x).is_integer() else str(x)) + + return df + + mock_excel_file.return_value.parse.side_effect = mock_parse + + data = store.load() + result_df = data["test_sheet"] + + assert mock_excel_file.return_value.parse.call_count == 2 + + first_call = mock_excel_file.return_value.parse.call_args_list[0] + assert first_call[1]["nrows"] == 0 + + second_call = mock_excel_file.return_value.parse.call_args_list[1] + assert "dtype" in second_call[1] + assert "Mock.Name" in second_call[1]["dtype"] + assert second_call[1]["dtype"]["Mock.Name"] is str + + assert result_df["Mock.Name"][0] == "123456789" # Long int as string + assert result_df["Mock.Name"][1] == "987.654" # Float as string + assert result_df["Other.Column"][0] == "value1" + assert result_df["Other.Column"][1] == "value2" + assert result_df["ID"][0] == 1 + assert result_df["ID"][1] == 2 + assert result_df["ratio"][0] == 0.1 + assert result_df["ratio"][1] == 0.2 From 970c76c0c8141e7ced632590c099fa3501f32b59 Mon Sep 17 00:00:00 2001 From: Jerry Guo Date: Wed, 24 Sep 2025 10:59:21 +0200 Subject: [PATCH 8/9] longer integer to trigger the loss of precision Signed-off-by: Jerry Guo --- tests/unit/data_stores/test_vision_excel_file_store.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/data_stores/test_vision_excel_file_store.py b/tests/unit/data_stores/test_vision_excel_file_store.py index fc591ae0..84e2a183 100644 --- a/tests/unit/data_stores/test_vision_excel_file_store.py +++ b/tests/unit/data_stores/test_vision_excel_file_store.py @@ -37,7 +37,7 @@ def mock_parse(*args, **kwargs): return preview_df else: actual_data = { - "Mock.Name": [123456789, 987.654], + "Mock.Name": [123456789000000, 987.654], "Other.Column": ["value1", "value2"], "ID": [1, 2], "ratio": [0.1, 0.2], @@ -66,7 +66,7 @@ def mock_parse(*args, **kwargs): assert "Mock.Name" in second_call[1]["dtype"] assert second_call[1]["dtype"]["Mock.Name"] is str - assert result_df["Mock.Name"][0] == "123456789" # Long int as string + assert result_df["Mock.Name"][0] == "123456789000000" # Long int as string assert result_df["Mock.Name"][1] == "987.654" # Float as string assert result_df["Other.Column"][0] == "value1" assert result_df["Other.Column"][1] == "value2" From 2755d3946fb6e4ea1858fc6a7b9279576ab61669 Mon Sep 17 00:00:00 2001 From: Nitish Bharambe <78108900+nitbharambe@users.noreply.github.com> Date: Wed, 24 Sep 2025 11:05:45 +0200 Subject: [PATCH 9/9] Apply suggestions from code review Signed-off-by: Nitish Bharambe <78108900+nitbharambe@users.noreply.github.com> --- tests/unit/data_stores/test_vision_excel_file_store.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/data_stores/test_vision_excel_file_store.py b/tests/unit/data_stores/test_vision_excel_file_store.py index 84e2a183..ba08cc7c 100644 --- a/tests/unit/data_stores/test_vision_excel_file_store.py +++ b/tests/unit/data_stores/test_vision_excel_file_store.py @@ -37,7 +37,7 @@ def mock_parse(*args, **kwargs): return preview_df else: actual_data = { - "Mock.Name": [123456789000000, 987.654], + "Mock.Name": [12345678900000000000, 987.654], "Other.Column": ["value1", "value2"], "ID": [1, 2], "ratio": [0.1, 0.2], @@ -66,7 +66,7 @@ def mock_parse(*args, **kwargs): assert "Mock.Name" in second_call[1]["dtype"] assert second_call[1]["dtype"]["Mock.Name"] is str - assert result_df["Mock.Name"][0] == "123456789000000" # Long int as string + assert result_df["Mock.Name"][0] == "12345678900000000000" # Long int as string assert result_df["Mock.Name"][1] == "987.654" # Float as string assert result_df["Other.Column"][0] == "value1" assert result_df["Other.Column"][1] == "value2"