Skip to content

TST (string dtype): fix groupby xfails with using_infer_string + update error message #59430

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Nov 8, 2024
14 changes: 14 additions & 0 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -2303,6 +2303,20 @@ def _groupby_op(
**kwargs,
):
if isinstance(self.dtype, StringDtype):
if how in [
"prod",
"mean",
"median",
"cumsum",
"cumprod",
"std",
"sem",
"var",
"skew",
]:
raise TypeError(
f"dtype '{self.dtype}' does not support operation '{how}'"
)
return super()._groupby_op(
how=how,
has_dropped_na=has_dropped_na,
Expand Down
14 changes: 14 additions & 0 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2608,6 +2608,20 @@ def _groupby_op(
# GH#43682
if isinstance(self.dtype, StringDtype):
# StringArray
if op.how in [
"prod",
"mean",
"median",
"cumsum",
"cumprod",
"std",
"sem",
"var",
"skew",
]:
raise TypeError(
f"dtype '{self.dtype}' does not support operation '{how}'"
)
if op.how not in ["any", "all"]:
# Fail early to avoid conversion to object
op._get_cython_function(op.kind, op.how, np.dtype(object), False)
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -4162,9 +4162,9 @@ def quantile(
starts, ends = lib.generate_slices(splitter._slabels, splitter.ngroups)

def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]:
if is_object_dtype(vals.dtype):
if isinstance(vals.dtype, StringDtype) or is_object_dtype(vals.dtype):
raise TypeError(
"'quantile' cannot be performed against 'object' dtypes!"
f"dtype '{vals.dtype}' does not support operation 'quantile'"
)

inference: DtypeObj | None = None
Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/frame/test_stack_unstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -2113,7 +2113,7 @@ def test_unstack_period_frame(self):
@pytest.mark.filterwarnings(
"ignore:The previous implementation of stack is deprecated"
)
def test_stack_multiple_bug(self, future_stack):
def test_stack_multiple_bug(self, future_stack, using_infer_string):
# bug when some uniques are not present in the data GH#3170
id_col = ([1] * 3) + ([2] * 3)
name = (["a"] * 3) + (["b"] * 3)
Expand All @@ -2125,6 +2125,8 @@ def test_stack_multiple_bug(self, future_stack):
multi.columns.name = "Params"
unst = multi.unstack("ID")
msg = re.escape("agg function failed [how->mean,dtype->")
if using_infer_string:
msg = "dtype 'str' does not support operation 'mean'"
with pytest.raises(TypeError, match=msg):
unst.resample("W-THU").mean()
down = unst.resample("W-THU").mean(numeric_only=True)
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/groupby/aggregate/test_cython.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,11 +148,11 @@ def test_cython_agg_return_dict():

def test_cython_fail_agg():
dr = bdate_range("1/1/2000", periods=50)
ts = Series(["A", "B", "C", "D", "E"] * 10, index=dr)
ts = Series(["A", "B", "C", "D", "E"] * 10, dtype=object, index=dr)

grouped = ts.groupby(lambda x: x.month)
summed = grouped.sum()
expected = grouped.agg(np.sum)
expected = grouped.agg(np.sum).astype(object)
Comment on lines 149 to +155
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Was there a specific reason you added an explicit dtype=object here? It seems you only added this in the last commit, after updating for sum() being implemented, so I think it is actually no longer needed now.

tm.assert_series_equal(summed, expected)


Expand Down
9 changes: 4 additions & 5 deletions pandas/tests/groupby/methods/test_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,8 @@ def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby,
def test_quantile_raises():
df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"])

with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"):
msg = "dtype 'object' does not support operation 'quantile'"
with pytest.raises(TypeError, match=msg):
df.groupby("key").quantile()


Expand Down Expand Up @@ -241,7 +242,6 @@ def test_groupby_quantile_nullable_array(values, q):
tm.assert_series_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
@pytest.mark.parametrize("numeric_only", [True, False])
def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only):
Expand All @@ -251,9 +251,8 @@ def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only):
expected = df.groupby("a")[["b"]].quantile(q)
tm.assert_frame_equal(result, expected)
else:
with pytest.raises(
TypeError, match="'quantile' cannot be performed against 'object' dtypes!"
):
msg = "dtype '.*' does not support operation 'quantile'"
with pytest.raises(TypeError, match=msg):
df.groupby("a").quantile(q, numeric_only=numeric_only)


Expand Down
56 changes: 42 additions & 14 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -425,7 +425,7 @@ def test_frame_multi_key_function_list():
tm.assert_frame_equal(agged, expected)


def test_frame_multi_key_function_list_partial_failure():
def test_frame_multi_key_function_list_partial_failure(using_infer_string):
data = DataFrame(
{
"A": [
Expand Down Expand Up @@ -476,6 +476,8 @@ def test_frame_multi_key_function_list_partial_failure():
grouped = data.groupby(["A", "B"])
funcs = ["mean", "std"]
msg = re.escape("agg function failed [how->mean,dtype->")
if using_infer_string:
msg = "dtype 'str' does not support operation 'mean'"
with pytest.raises(TypeError, match=msg):
grouped.agg(funcs)

Expand Down Expand Up @@ -662,9 +664,11 @@ def test_groupby_multi_corner(df):
tm.assert_frame_equal(agged, expected)


def test_raises_on_nuisance(df):
def test_raises_on_nuisance(df, using_infer_string):
grouped = df.groupby("A")
msg = re.escape("agg function failed [how->mean,dtype->")
if using_infer_string:
msg = "dtype 'str' does not support operation 'mean'"
with pytest.raises(TypeError, match=msg):
grouped.agg("mean")
with pytest.raises(TypeError, match=msg):
Expand Down Expand Up @@ -699,15 +703,18 @@ def test_keep_nuisance_agg(df, agg_function):
["sum", "mean", "prod", "std", "var", "sem", "median"],
)
@pytest.mark.parametrize("numeric_only", [True, False])
def test_omit_nuisance_agg(df, agg_function, numeric_only):
def test_omit_nuisance_agg(df, agg_function, numeric_only, using_infer_string):
# GH 38774, GH 38815
grouped = df.groupby("A")

no_drop_nuisance = ("var", "std", "sem", "mean", "prod", "median")
if agg_function in no_drop_nuisance and not numeric_only:
# Added numeric_only as part of GH#46560; these do not drop nuisance
# columns when numeric_only is False
if agg_function in ("std", "sem"):
if using_infer_string:
msg = f"dtype 'str' does not support operation '{agg_function}'"
klass = TypeError
elif agg_function in ("std", "sem"):
klass = ValueError
msg = "could not convert string to float: 'one'"
else:
Expand All @@ -728,16 +735,24 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only):
tm.assert_frame_equal(result, expected)


def test_raise_on_nuisance_python_single(df):
def test_raise_on_nuisance_python_single(df, using_infer_string):
# GH 38815
grouped = df.groupby("A")
with pytest.raises(ValueError, match="could not convert"):

err = ValueError
msg = "could not convert"
if using_infer_string:
err = TypeError
msg = "dtype 'str' does not support operation 'skew'"
with pytest.raises(err, match=msg):
grouped.skew()


def test_raise_on_nuisance_python_multiple(three_group):
def test_raise_on_nuisance_python_multiple(three_group, using_infer_string):
grouped = three_group.groupby(["A", "B"])
msg = re.escape("agg function failed [how->mean,dtype->")
if using_infer_string:
msg = "dtype 'str' does not support operation 'mean'"
with pytest.raises(TypeError, match=msg):
grouped.agg("mean")
with pytest.raises(TypeError, match=msg):
Expand Down Expand Up @@ -775,12 +790,16 @@ def test_nonsense_func():
df.groupby(lambda x: x + "foo")


def test_wrap_aggregated_output_multindex(multiindex_dataframe_random_data):
def test_wrap_aggregated_output_multindex(
multiindex_dataframe_random_data, using_infer_string
):
df = multiindex_dataframe_random_data.T
df["baz", "two"] = "peekaboo"

keys = [np.array([0, 0, 1]), np.array([0, 0, 1])]
msg = re.escape("agg function failed [how->mean,dtype->")
if using_infer_string:
msg = "dtype 'str' does not support operation 'mean'"
with pytest.raises(TypeError, match=msg):
df.groupby(keys).agg("mean")
agged = df.drop(columns=("baz", "two")).groupby(keys).agg("mean")
Expand Down Expand Up @@ -960,8 +979,10 @@ def test_groupby_with_hier_columns():

def test_grouping_ndarray(df):
grouped = df.groupby(df["A"].values)
grouped2 = df.groupby(df["A"].rename(None))

result = grouped.sum()
expected = df.groupby(df["A"].rename(None)).sum()
expected = grouped2.sum()
tm.assert_frame_equal(result, expected)


Expand Down Expand Up @@ -1457,8 +1478,8 @@ def test_no_dummy_key_names(df):
result = df.groupby(df["A"].values).sum()
assert result.index.name is None

result = df.groupby([df["A"].values, df["B"].values]).sum()
assert result.index.names == (None, None)
result2 = df.groupby([df["A"].values, df["B"].values]).sum()
assert result2.index.names == (None, None)


def test_groupby_sort_multiindex_series():
Expand Down Expand Up @@ -1761,6 +1782,7 @@ def get_categorical_invalid_expected():
is_per = isinstance(df.dtypes.iloc[0], pd.PeriodDtype)
is_dt64 = df.dtypes.iloc[0].kind == "M"
is_cat = isinstance(values, Categorical)
is_str = isinstance(df.dtypes.iloc[0], pd.StringDtype)

if (
isinstance(values, Categorical)
Expand All @@ -1785,13 +1807,15 @@ def get_categorical_invalid_expected():

if op in ["prod", "sum", "skew"]:
# ops that require more than just ordered-ness
if is_dt64 or is_cat or is_per:
if is_dt64 or is_cat or is_per or (is_str and op != "sum"):
# GH#41291
# datetime64 -> prod and sum are invalid
if is_dt64:
msg = "datetime64 type does not support"
elif is_per:
msg = "Period type does not support"
elif is_str:
msg = f"dtype 'str' does not support operation '{op}'"
else:
msg = "category type does not support"
if op == "skew":
Expand Down Expand Up @@ -2714,7 +2738,7 @@ def test_obj_with_exclusions_duplicate_columns():
def test_groupby_numeric_only_std_no_result(numeric_only):
# GH 51080
dicts_non_numeric = [{"a": "foo", "b": "bar"}, {"a": "car", "b": "dar"}]
df = DataFrame(dicts_non_numeric)
df = DataFrame(dicts_non_numeric, dtype=object)
dfgb = df.groupby("a", as_index=False, sort=False)

if numeric_only:
Expand Down Expand Up @@ -2773,10 +2797,14 @@ def test_grouping_with_categorical_interval_columns():
def test_groupby_sum_on_nan_should_return_nan(bug_var):
# GH 24196
df = DataFrame({"A": [bug_var, bug_var, bug_var, np.nan]})
if isinstance(bug_var, str):
df = df.astype(object)
dfgb = df.groupby(lambda x: x)
result = dfgb.sum(min_count=1)

expected_df = DataFrame([bug_var, bug_var, bug_var, None], columns=["A"])
expected_df = DataFrame(
[bug_var, bug_var, bug_var, None], columns=["A"], dtype=df["A"].dtype
)
tm.assert_frame_equal(result, expected_df)


Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/test_groupby_subclass.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ def test_groupby_resample_preserves_subclass(obj):

df = obj(
{
"Buyer": "Carl Carl Carl Carl Joe Carl".split(),
"Buyer": Series("Carl Carl Carl Carl Joe Carl".split(), dtype=object),
"Quantity": [18, 3, 5, 1, 9, 3],
"Date": [
datetime(2013, 9, 1, 13, 0),
Expand Down
20 changes: 14 additions & 6 deletions pandas/tests/groupby/test_numeric_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ def df(self):
"group": [1, 1, 2],
"int": [1, 2, 3],
"float": [4.0, 5.0, 6.0],
"string": list("abc"),
"string": Series(["a", "b", "c"], dtype="str"),
"object": Series(["a", "b", "c"], dtype=object),
"category_string": Series(list("abc")).astype("category"),
"category_int": [7, 8, 9],
"datetime": date_range("20130101", periods=3),
Expand All @@ -40,6 +41,7 @@ def df(self):
"int",
"float",
"string",
"object",
"category_string",
"category_int",
"datetime",
Expand Down Expand Up @@ -112,6 +114,7 @@ def test_first_last(self, df, method):
"int",
"float",
"string",
"object",
"category_string",
"category_int",
"datetime",
Expand Down Expand Up @@ -159,7 +162,9 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):

# object dtypes for transformations are not implemented in Cython and
# have no Python fallback
exception = NotImplementedError if method.startswith("cum") else TypeError
exception = (
(NotImplementedError, TypeError) if method.startswith("cum") else TypeError
)

if method in ("min", "max", "cummin", "cummax", "cumsum", "cumprod"):
# The methods default to numeric_only=False and raise TypeError
Expand All @@ -170,6 +175,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
re.escape(f"agg function failed [how->{method},dtype->object]"),
# cumsum/cummin/cummax/cumprod
"function is not implemented for this dtype",
f"dtype 'str' does not support operation '{method}'",
]
)
with pytest.raises(exception, match=msg):
Expand All @@ -180,7 +186,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
"category type does not support sum operations",
re.escape(f"agg function failed [how->{method},dtype->object]"),
re.escape(f"agg function failed [how->{method},dtype->string]"),
re.escape(f"agg function failed [how->{method},dtype->str]"),
f"dtype 'str' does not support operation '{method}'",
]
)
with pytest.raises(exception, match=msg):
Expand All @@ -198,7 +204,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
f"Cannot perform {method} with non-ordered Categorical",
re.escape(f"agg function failed [how->{method},dtype->object]"),
re.escape(f"agg function failed [how->{method},dtype->string]"),
re.escape(f"agg function failed [how->{method},dtype->str]"),
f"dtype 'str' does not support operation '{method}'",
]
)
with pytest.raises(exception, match=msg):
Expand Down Expand Up @@ -299,7 +305,9 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys):
re.escape(f"agg function failed [how->{kernel},dtype->object]"),
]
)
if kernel == "idxmin":
if kernel == "quantile":
msg = "dtype 'object' does not support operation 'quantile'"
elif kernel == "idxmin":
msg = "'<' not supported between instances of 'type' and 'type'"
elif kernel == "idxmax":
msg = "'>' not supported between instances of 'type' and 'type'"
Expand Down Expand Up @@ -379,7 +387,7 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request):
# that succeed should not be allowed to fail (without deprecation, at least)
if groupby_func in fails_on_numeric_object and dtype is object:
if groupby_func == "quantile":
msg = "cannot be performed against 'object' dtypes"
msg = "dtype 'object' does not support operation 'quantile'"
else:
msg = "is not supported for object dtype"
with pytest.raises(TypeError, match=msg):
Expand Down
Loading
Loading