From 84636648a7c47df4155ea1edf3e0685d5ad29f3f Mon Sep 17 00:00:00 2001
From: nour-taqatqa <nourtaqatqa2025@u.northwestern.edu>
Date: Sun, 19 Oct 2025 11:10:46 -0500
Subject: [PATCH 1/6] Add test for Arrow decimal groupby variance

---
 pandas/core/groupby/groupby.py                |  4 +-
 pandas/tests/groupby/aggregate/test_cython.py | 61 +++++++++++++++++++
 2 files changed, 63 insertions(+), 2 deletions(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index fe7bf5bbc4c2c..12522b1c78e13 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1758,9 +1758,9 @@ def _cython_agg_general(
 
         data = self._get_data_to_aggregate(numeric_only=numeric_only, name=how)
 
-        def array_func(values: ArrayLike) -> ArrayLike:
+        def array_func(values: ArrayLike) -> ArrayLike:  
             try:
-                result = self._grouper._cython_operation(
+                result = self._grouper._cython_operation( 
                     "aggregate",
                     values,
                     how,
diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py
index a706ea795a0e2..4dccc660ef13e 100644
--- a/pandas/tests/groupby/aggregate/test_cython.py
+++ b/pandas/tests/groupby/aggregate/test_cython.py
@@ -20,7 +20,10 @@
     Timestamp,
     bdate_range,
 )
+import pyarrow as pa
+import decimal
 import pandas._testing as tm
+import math
 
 
 @pytest.mark.parametrize(
@@ -413,3 +416,61 @@ def test_cython_agg_EA_known_dtypes(data, op_name, action, with_na):
 
     result = grouped["col"].aggregate(op_name)
     assert result.dtype == expected_dtype
+
+#testing groupby.var() when called with pyarrow datatype 
+
+@pytest.mark.parametrize("with_na", [False, True])
+def test_groupby_var_arrow_decimal(with_na):
+    # Create Arrow-backed decimal Series
+    data = pd.Series(
+        [
+            decimal.Decimal("123.000"),
+            decimal.Decimal("12.000"),
+            decimal.Decimal("5.5"),
+            decimal.Decimal("7.25")
+        ],
+        dtype=pd.ArrowDtype(pa.decimal128(6, 3))
+    )
+
+    if with_na:
+        data.iloc[3] = pd.NA  # introduce a missing value
+
+    df = DataFrame({"key": ["a", "a", "b", "b"], "col": data})
+    grouped = df.groupby("key")
+
+    # Perform the aggregation using .var() (calls _cython_agg_general internally)
+    result = grouped.var()#it correctly converts it to double[pyarrow]
+
+
+    # Check dtype is still Arrow double
+    expected_dtype = pd.ArrowDtype(pa.float64())  
+    assert isinstance(result["col"].dtype, pd.ArrowDtype)
+    assert result["col"].dtype == expected_dtype
+ 
+
+    # Compute expected variance manually for group "a"
+    vals_a = [123.0, 12.0]  # convert to float
+    if with_na:
+        vals_b = [5.5]  # single value → var is NA
+    else:
+        vals_b = [5.5, 7.25]
+
+    # Compute variance using pandas (float)
+    expected_var_a = pd.Series(vals_a).var()
+    expected_var_b = pd.Series(vals_b).var() if len(vals_b) > 1 else pd.NA
+
+    # Helper function for float comparison with NA support
+    def _almost_equal_or_na(a, b, tol=1e-12):
+        if pd.isna(a) and pd.isna(b):
+            return True
+        return math.isclose(float(a), float(b), rel_tol=tol, abs_tol=tol)
+
+    # Compare the DataFrame result
+    assert _almost_equal_or_na(result.loc["a", "col"], expected_var_a)
+    assert _almost_equal_or_na(result.loc["b", "col"], expected_var_b)
+
+    # Also test the SeriesGroupBy path
+    result_series = grouped["col"].var()
+    assert _almost_equal_or_na(result_series.loc["a"], expected_var_a)
+    assert _almost_equal_or_na(result_series.loc["b"], expected_var_b)
+

From d2d716a845582a42bcb455df496b88e3a477cc7f Mon Sep 17 00:00:00 2001
From: nour-taqatqa <nourtaqatqa2025@u.northwestern.edu>
Date: Sun, 19 Oct 2025 11:18:14 -0500
Subject: [PATCH 2/6] edited comment for test

---
 pandas/tests/groupby/aggregate/test_cython.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py
index 4dccc660ef13e..93c9eb89327ad 100644
--- a/pandas/tests/groupby/aggregate/test_cython.py
+++ b/pandas/tests/groupby/aggregate/test_cython.py
@@ -417,10 +417,10 @@ def test_cython_agg_EA_known_dtypes(data, op_name, action, with_na):
     result = grouped["col"].aggregate(op_name)
     assert result.dtype == expected_dtype
 
-#testing groupby.var() when called with pyarrow datatype 
 
 @pytest.mark.parametrize("with_na", [False, True])
 def test_groupby_var_arrow_decimal(with_na):
+    #testing groupby.var() when called with pyarrow datatype 
     # Create Arrow-backed decimal Series
     data = pd.Series(
         [

From 2781b3ea7ce9f822378c17eb552ccac79cc48dc4 Mon Sep 17 00:00:00 2001
From: nour-taqatqa <nourtaqatqa2025@u.northwestern.edu>
Date: Tue, 21 Oct 2025 15:40:33 -0500
Subject: [PATCH 3/6] reverted groupby.py to upstream/main

---
 pandas/core/groupby/groupby.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 12522b1c78e13..fe7bf5bbc4c2c 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1758,9 +1758,9 @@ def _cython_agg_general(
 
         data = self._get_data_to_aggregate(numeric_only=numeric_only, name=how)
 
-        def array_func(values: ArrayLike) -> ArrayLike:  
+        def array_func(values: ArrayLike) -> ArrayLike:
             try:
-                result = self._grouper._cython_operation( 
+                result = self._grouper._cython_operation(
                     "aggregate",
                     values,
                     how,

From b2baf44573d63a2fb1a1841c0b485ab62af05f51 Mon Sep 17 00:00:00 2001
From: nour-taqatqa <nourtaqatqa2025@u.northwestern.edu>
Date: Tue, 21 Oct 2025 16:38:06 -0500
Subject: [PATCH 4/6] Fix test for Arrow-backed variance and update expected
 dtype

---
 pandas/tests/extension/test_arrow.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index c810f098f15cf..18b0220478594 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -3700,3 +3700,19 @@ def test_pow_with_all_na_float():
     result = s.pow(2)
     expected = pd.Series([pd.NA, pd.NA], dtype="float64[pyarrow]")
     tm.assert_series_equal(result, expected)
+
+
+def test_groupby_var_arrow_decimal():
+    # var() of an Arrow decimal column should come back as Arrow float64
+    df = pd.DataFrame(
+        {
+            "A": pd.Series([True, True], dtype="bool[pyarrow]"),
+            "B": pd.Series(
+                [Decimal(123), Decimal(12)],
+                dtype=pd.ArrowDtype(pa.decimal128(6, 3)),
+            ),
+        }
+    )
+    result = df.groupby("A").var().dtypes
+    expected = pd.Series([pd.ArrowDtype(pa.float64())], index=result.index)
+    tm.assert_series_equal(result, expected)

From 2fce7681be0686c43fb6153301425728d802832b Mon Sep 17 00:00:00 2001
From: nour-taqatqa <nourtaqatqa2025@u.northwestern.edu>
Date: Tue, 21 Oct 2025 16:41:20 -0500
Subject: [PATCH 5/6] Delete old test from test_cython.py

---
 pandas/tests/groupby/aggregate/test_cython.py | 60 +------------------
 1 file changed, 1 insertion(+), 59 deletions(-)

diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py
index 93c9eb89327ad..3b57f2ff88f4e 100644
--- a/pandas/tests/groupby/aggregate/test_cython.py
+++ b/pandas/tests/groupby/aggregate/test_cython.py
@@ -415,62 +415,4 @@ def test_cython_agg_EA_known_dtypes(data, op_name, action, with_na):
     assert result.dtype == expected_dtype
 
     result = grouped["col"].aggregate(op_name)
-    assert result.dtype == expected_dtype
-
-
-@pytest.mark.parametrize("with_na", [False, True])
-def test_groupby_var_arrow_decimal(with_na):
-    #testing groupby.var() when called with pyarrow datatype 
-    # Create Arrow-backed decimal Series
-    data = pd.Series(
-        [
-            decimal.Decimal("123.000"),
-            decimal.Decimal("12.000"),
-            decimal.Decimal("5.5"),
-            decimal.Decimal("7.25")
-        ],
-        dtype=pd.ArrowDtype(pa.decimal128(6, 3))
-    )
-
-    if with_na:
-        data.iloc[3] = pd.NA  # introduce a missing value
-
-    df = DataFrame({"key": ["a", "a", "b", "b"], "col": data})
-    grouped = df.groupby("key")
-
-    # Perform the aggregation using .var() (calls _cython_agg_general internally)
-    result = grouped.var()#it correctly converts it to double[pyarrow]
-
-
-    # Check dtype is still Arrow double
-    expected_dtype = pd.ArrowDtype(pa.float64())  
-    assert isinstance(result["col"].dtype, pd.ArrowDtype)
-    assert result["col"].dtype == expected_dtype
- 
-
-    # Compute expected variance manually for group "a"
-    vals_a = [123.0, 12.0]  # convert to float
-    if with_na:
-        vals_b = [5.5]  # single value → var is NA
-    else:
-        vals_b = [5.5, 7.25]
-
-    # Compute variance using pandas (float)
-    expected_var_a = pd.Series(vals_a).var()
-    expected_var_b = pd.Series(vals_b).var() if len(vals_b) > 1 else pd.NA
-
-    # Helper function for float comparison with NA support
-    def _almost_equal_or_na(a, b, tol=1e-12):
-        if pd.isna(a) and pd.isna(b):
-            return True
-        return math.isclose(float(a), float(b), rel_tol=tol, abs_tol=tol)
-
-    # Compare the DataFrame result
-    assert _almost_equal_or_na(result.loc["a", "col"], expected_var_a)
-    assert _almost_equal_or_na(result.loc["b", "col"], expected_var_b)
-
-    # Also test the SeriesGroupBy path
-    result_series = grouped["col"].var()
-    assert _almost_equal_or_na(result_series.loc["a"], expected_var_a)
-    assert _almost_equal_or_na(result_series.loc["b"], expected_var_b)
-
+    assert result.dtype == expected_dtype
\ No newline at end of file

From 537a152a9ee6334dde87f5676090f35ed6c75f95 Mon Sep 17 00:00:00 2001
From: nour-taqatqa <nourtaqatqa2025@u.northwestern.edu>
Date: Tue, 21 Oct 2025 16:44:51 -0500
Subject: [PATCH 6/6] Revert test_cython.py to upstream/main

---
 pandas/tests/groupby/aggregate/test_cython.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py
index 3b57f2ff88f4e..a706ea795a0e2 100644
--- a/pandas/tests/groupby/aggregate/test_cython.py
+++ b/pandas/tests/groupby/aggregate/test_cython.py
@@ -20,10 +20,7 @@
     Timestamp,
     bdate_range,
 )
-import pyarrow as pa
-import decimal
 import pandas._testing as tm
-import math
 
 
 @pytest.mark.parametrize(
@@ -415,4 +412,4 @@ def test_cython_agg_EA_known_dtypes(data, op_name, action, with_na):
     assert result.dtype == expected_dtype
 
     result = grouped["col"].aggregate(op_name)
-    assert result.dtype == expected_dtype
\ No newline at end of file
+    assert result.dtype == expected_dtype