From 84636648a7c47df4155ea1edf3e0685d5ad29f3f Mon Sep 17 00:00:00 2001 From: nour-taqatqa Date: Sun, 19 Oct 2025 11:10:46 -0500 Subject: [PATCH 1/6] Add test for Arrow decimal groupby variance --- pandas/core/groupby/groupby.py | 4 +- pandas/tests/groupby/aggregate/test_cython.py | 61 +++++++++++++++++++ 2 files changed, 63 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index fe7bf5bbc4c2c..12522b1c78e13 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1758,9 +1758,9 @@ def _cython_agg_general( data = self._get_data_to_aggregate(numeric_only=numeric_only, name=how) - def array_func(values: ArrayLike) -> ArrayLike: + def array_func(values: ArrayLike) -> ArrayLike: try: - result = self._grouper._cython_operation( + result = self._grouper._cython_operation( "aggregate", values, how, diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index a706ea795a0e2..4dccc660ef13e 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -20,7 +20,10 @@ Timestamp, bdate_range, ) +import pyarrow as pa +import decimal import pandas._testing as tm +import math @pytest.mark.parametrize( @@ -413,3 +416,61 @@ def test_cython_agg_EA_known_dtypes(data, op_name, action, with_na): result = grouped["col"].aggregate(op_name) assert result.dtype == expected_dtype + +#testing groupby.var() when called with pyarrow datatype + +@pytest.mark.parametrize("with_na", [False, True]) +def test_groupby_var_arrow_decimal(with_na): + # Create Arrow-backed decimal Series + data = pd.Series( + [ + decimal.Decimal("123.000"), + decimal.Decimal("12.000"), + decimal.Decimal("5.5"), + decimal.Decimal("7.25") + ], + dtype=pd.ArrowDtype(pa.decimal128(6, 3)) + ) + + if with_na: + data.iloc[3] = pd.NA # introduce a missing value + + df = DataFrame({"key": ["a", "a", "b", "b"], "col": data}) + grouped = df.groupby("key") + + # Perform the aggregation using .var() (calls _cython_agg_general internally) + result = grouped.var()#it correctly converts it to double[pyarrow] + + + # Check dtype is still Arrow double + expected_dtype = pd.ArrowDtype(pa.float64()) + assert isinstance(result["col"].dtype, pd.ArrowDtype) + assert result["col"].dtype == expected_dtype + + + # Compute expected variance manually for group "a" + vals_a = [123.0, 12.0] # convert to float + if with_na: + vals_b = [5.5] # single value → var is NA + else: + vals_b = [5.5, 7.25] + + # Compute variance using pandas (float) + expected_var_a = pd.Series(vals_a).var() + expected_var_b = pd.Series(vals_b).var() if len(vals_b) > 1 else pd.NA + + # Helper function for float comparison with NA support + def _almost_equal_or_na(a, b, tol=1e-12): + if pd.isna(a) and pd.isna(b): + return True + return math.isclose(float(a), float(b), rel_tol=tol, abs_tol=tol) + + # Compare the DataFrame result + assert _almost_equal_or_na(result.loc["a", "col"], expected_var_a) + assert _almost_equal_or_na(result.loc["b", "col"], expected_var_b) + + # Also test the SeriesGroupBy path + result_series = grouped["col"].var() + assert _almost_equal_or_na(result_series.loc["a"], expected_var_a) + assert _almost_equal_or_na(result_series.loc["b"], expected_var_b) + From d2d716a845582a42bcb455df496b88e3a477cc7f Mon Sep 17 00:00:00 2001 From: nour-taqatqa Date: Sun, 19 Oct 2025 11:18:14 -0500 Subject: [PATCH 2/6] edited comment for test\ --- pandas/tests/groupby/aggregate/test_cython.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 4dccc660ef13e..93c9eb89327ad 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -417,10 +417,10 @@ def test_cython_agg_EA_known_dtypes(data, op_name, action, with_na): result = grouped["col"].aggregate(op_name) assert result.dtype == expected_dtype -#testing groupby.var() when called with pyarrow datatype @pytest.mark.parametrize("with_na", [False, True]) def test_groupby_var_arrow_decimal(with_na): + #testing groupby.var() when called with pyarrow datatype # Create Arrow-backed decimal Series data = pd.Series( [ From 2781b3ea7ce9f822378c17eb552ccac79cc48dc4 Mon Sep 17 00:00:00 2001 From: nour-taqatqa Date: Tue, 21 Oct 2025 15:40:33 -0500 Subject: [PATCH 3/6] reverted groupby.py to upstream/main --- pandas/core/groupby/groupby.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 12522b1c78e13..fe7bf5bbc4c2c 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1758,9 +1758,9 @@ def _cython_agg_general( data = self._get_data_to_aggregate(numeric_only=numeric_only, name=how) - def array_func(values: ArrayLike) -> ArrayLike: + def array_func(values: ArrayLike) -> ArrayLike: try: - result = self._grouper._cython_operation( + result = self._grouper._cython_operation( "aggregate", values, how, From b2baf44573d63a2fb1a1841c0b485ab62af05f51 Mon Sep 17 00:00:00 2001 From: nour-taqatqa Date: Tue, 21 Oct 2025 16:38:06 -0500 Subject: [PATCH 4/6] Fix test for Arrow-backed variance and update expected dtype --- pandas/tests/extension/test_arrow.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index c810f098f15cf..18b0220478594 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3700,3 +3700,10 @@ def test_pow_with_all_na_float(): result = s.pow(2) expected = pd.Series([pd.NA, pd.NA], dtype="float64[pyarrow]") tm.assert_series_equal(result, expected) + +def test_groupby_var_arrow_decimal(): + df = pd.DataFrame({"A": pd.Series([True, True], dtype="bool[pyarrow]"), "B": pd.Series([Decimal(123), Decimal(12)], dtype=pd.ArrowDtype(pa.decimal128(6,3)))}) + result=df.groupby("A").var().dtypes + expected = pd.Series([pd.ArrowDtype(pa.float64())], index=result.index) + + tm.assert_series_equal(result, expected) \ No newline at end of file From 2fce7681be0686c43fb6153301425728d802832b Mon Sep 17 00:00:00 2001 From: nour-taqatqa Date: Tue, 21 Oct 2025 16:41:20 -0500 Subject: [PATCH 5/6] Delete old test from test_cython.py --- pandas/tests/groupby/aggregate/test_cython.py | 60 +------------------ 1 file changed, 1 insertion(+), 59 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 93c9eb89327ad..3b57f2ff88f4e 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -415,62 +415,4 @@ def test_cython_agg_EA_known_dtypes(data, op_name, action, with_na): assert result.dtype == expected_dtype result = grouped["col"].aggregate(op_name) - assert result.dtype == expected_dtype - - -@pytest.mark.parametrize("with_na", [False, True]) -def test_groupby_var_arrow_decimal(with_na): - #testing groupby.var() when called with pyarrow datatype - # Create Arrow-backed decimal Series - data = pd.Series( - [ - decimal.Decimal("123.000"), - decimal.Decimal("12.000"), - decimal.Decimal("5.5"), - decimal.Decimal("7.25") - ], - dtype=pd.ArrowDtype(pa.decimal128(6, 3)) - ) - - if with_na: - data.iloc[3] = pd.NA # introduce a missing value - - df = DataFrame({"key": ["a", "a", "b", "b"], "col": data}) - grouped = df.groupby("key") - - # Perform the aggregation using .var() (calls _cython_agg_general internally) - result = grouped.var()#it correctly converts it to double[pyarrow] - - - # Check dtype is still Arrow double - expected_dtype = pd.ArrowDtype(pa.float64()) - assert isinstance(result["col"].dtype, pd.ArrowDtype) - assert result["col"].dtype == expected_dtype - - - # Compute expected variance manually for group "a" - vals_a = [123.0, 12.0] # convert to float - if with_na: - vals_b = [5.5] # single value → var is NA - else: - vals_b = [5.5, 7.25] - - # Compute variance using pandas (float) - expected_var_a = pd.Series(vals_a).var() - expected_var_b = pd.Series(vals_b).var() if len(vals_b) > 1 else pd.NA - - # Helper function for float comparison with NA support - def _almost_equal_or_na(a, b, tol=1e-12): - if pd.isna(a) and pd.isna(b): - return True - return math.isclose(float(a), float(b), rel_tol=tol, abs_tol=tol) - - # Compare the DataFrame result - assert _almost_equal_or_na(result.loc["a", "col"], expected_var_a) - assert _almost_equal_or_na(result.loc["b", "col"], expected_var_b) - - # Also test the SeriesGroupBy path - result_series = grouped["col"].var() - assert _almost_equal_or_na(result_series.loc["a"], expected_var_a) - assert _almost_equal_or_na(result_series.loc["b"], expected_var_b) - + assert result.dtype == expected_dtype \ No newline at end of file From 537a152a9ee6334dde87f5676090f35ed6c75f95 Mon Sep 17 00:00:00 2001 From: nour-taqatqa Date: Tue, 21 Oct 2025 16:44:51 -0500 Subject: [PATCH 6/6] Revert test_cython.py to upstream/main --- pandas/tests/groupby/aggregate/test_cython.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 3b57f2ff88f4e..a706ea795a0e2 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -20,10 +20,7 @@ Timestamp, bdate_range, ) -import pyarrow as pa -import decimal import pandas._testing as tm -import math @pytest.mark.parametrize( @@ -415,4 +412,4 @@ def test_cython_agg_EA_known_dtypes(data, op_name, action, with_na): assert result.dtype == expected_dtype result = grouped["col"].aggregate(op_name) - assert result.dtype == expected_dtype \ No newline at end of file + assert result.dtype == expected_dtype