Skip to content

Commit d7d2280

Browse files
devin-petersohn authored and HyukjinKwon committed
[SPARK-40337][PS] Implement SeriesGroupBy.describe in pandas API on Spark
### What changes were proposed in this pull request? Implement `SeriesGroupBy.describe()` in the pandas API on Spark ### Why are the changes needed? Missing API coverage ### Does this PR introduce _any_ user-facing change? Yes ### How was this patch tested? Unit tests ### Was this patch authored or co-authored using generative AI tooling? Generated-by: Claude Code (claude-opus-4-6) Closes #55686 from devin-petersohn/SPARK-40337-series-groupby-describe. Authored-by: Devin Petersohn <devin.petersohn@gmail.com> Signed-off-by: Hyukjin Kwon <gurwls223@apache.org> (cherry picked from commit 11d51c7) Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
1 parent 9f6486e commit d7d2280

4 files changed

Lines changed: 142 additions & 1 deletion

File tree

python/docs/source/reference/pyspark.pandas/groupby.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ The following methods are available only for `SeriesGroupBy` objects.
104104
.. autosummary::
105105
:toctree: api/
106106

107+
SeriesGroupBy.describe
107108
SeriesGroupBy.nsmallest
108109
SeriesGroupBy.nlargest
109110
SeriesGroupBy.value_counts

python/pyspark/pandas/groupby.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4371,6 +4371,76 @@ def _handle_output(
43714371
else:
43724372
return first_series(psdf).rename(self._psser.name)
43734373

4374+
def describe(self) -> DataFrame:
    """
    Generate descriptive statistics that summarize the central tendency,
    dispersion and shape of a dataset's distribution, excluding
    ``NaN`` values.

    .. versionadded:: 4.3.0

    .. note:: Unlike pandas, the percentiles in pandas-on-Spark are based upon
        approximate percentile computation because computing percentiles
        across a large dataset is extremely expensive.

    Returns
    -------
    DataFrame
        Summary statistics for each group and the series values.

    See Also
    --------
    DataFrame.describe

    Examples
    --------
    >>> df = ps.DataFrame({'a': [1, 1, 3], 'b': [4, 5, 6]})
    >>> df.groupby('a')['b'].describe().sort_index()  # doctest: +NORMALIZE_WHITESPACE
       count  mean       std  min  25%  50%  75%  max
    a
    1    2.0   4.5  0.707107  4.0  4.0  4.0  5.0  5.0
    3    1.0   6.0       NaN  6.0  6.0  6.0  6.0  6.0
    """
    # NOTE: keep in sync with GroupBy.describe
    if isinstance(self._agg_columns[0].spark.data_type, StringType):
        raise NotImplementedError(
            "SeriesGroupBy.describe() doesn't support for string type for now"
        )

    # Use the base GroupBy.aggregate to bypass SeriesGroupBy's unsupported override.
    psdf = GroupBy.aggregate(self, ["count", "mean", "std", "min", "quartiles", "max"])
    sdf = psdf._internal.spark_frame

    percentile_names = ["25%", "50%", "75%"]
    labels = [psser._column_label for psser in self._agg_columns]

    def _col_name(label, stat):
        # Flat Spark column name for (label, stat), e.g. ("b",) + "count" -> "(b, count)".
        return name_like_string(tuple(label) + (stat,))

    # Expand each "quartiles" array column into separate 25%/50%/75% columns.
    for label in labels:
        quartiles_name = _col_name(label, "quartiles")
        for idx, pct in enumerate(percentile_names):
            sdf = sdf.withColumn(
                _col_name(label, pct),
                scol_for(sdf, quartiles_name)[idx],
            )
        sdf = sdf.drop(quartiles_name)

    # Build single-level column labels: ["count", "mean", "std", "min", "25%", ...]
    stat_names = ["count", "mean", "std", "min"] + percentile_names + ["max"]
    # For SeriesGroupBy there is only one aggregated column label.
    only_label = labels[0]
    data_columns = [_col_name(only_label, stat) for stat in stat_names]

    internal = psdf._internal.copy(
        spark_frame=sdf,
        column_labels=[(stat,) for stat in stat_names],
        data_spark_columns=[scol_for(sdf, name) for name in data_columns],
        data_fields=None,
        column_label_names=None,
    )

    # Cast columns to "float64" to match pandas.SeriesGroupBy.describe().
    return DataFrame(internal).astype("float64")
4443+
43744444
def agg(self, *args: Any, **kwargs: Any) -> None:
43754445
return MissingPandasLikeSeriesGroupBy.agg(self, *args, **kwargs)
43764446

python/pyspark/pandas/missing/groupby.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,6 @@ class MissingPandasLikeSeriesGroupBy:
8484
# Functions
8585
agg = _unsupported_function("agg")
8686
aggregate = _unsupported_function("aggregate")
87-
describe = _unsupported_function("describe")
8887
ngroup = _unsupported_function("ngroup")
8988
ohlc = _unsupported_function("ohlc")
9089
pct_change = _unsupported_function("pct_change")

python/pyspark/pandas/tests/groupby/test_describe.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,77 @@ def test_describe(self):
111111
quantile_pdf.rename(columns="{:.0%}".format, level=2),
112112
)
113113

114+
def _check_series_groupby_describe(self, pdf, groupby_col, value_col):
    """Compare SeriesGroupBy.describe between pandas and pandas-on-Spark."""
    psdf = ps.from_pandas(pdf)

    expected = pdf.groupby(groupby_col)[value_col].describe().sort_index()
    actual = psdf.groupby(groupby_col)[value_col].describe().sort_index()

    plain_stats = ["count", "mean", "std", "min", "max"]
    pct_labels = ["25%", "50%", "75%"]
    pct_values = [0.25, 0.5, 0.75]

    # 1. Non-percentile statistics should agree with pandas (float tolerance).
    self.assert_eq(
        actual[plain_stats],
        expected[plain_stats],
        check_exact=False,
    )

    # 2. Percentiles are approximate in Spark, so compare against pandas'
    #    nearest-interpolation quantiles rather than the default linear ones.
    reference = (
        pdf.groupby(groupby_col)[value_col]
        .quantile(pct_values, interpolation="nearest")
        .unstack(level=1)
        .astype(float)
    )
    reference.columns = ["{:.0%}".format(p) for p in pct_values]
    self.assert_eq(
        actual[pct_labels],
        reference,
    )
145+
def test_series_groupby_describe(self):
    # Basic numeric case
    self._check_series_groupby_describe(
        pd.DataFrame({"a": [1, 1, 3], "b": [4, 5, 6]}), "a", "b"
    )

    # Floats and negatives with larger groups
    self._check_series_groupby_describe(
        pd.DataFrame({"a": [1, 1, 1, 2, 2, 2], "b": [-1.5, 2.0, 3.5, 10.0, 20.0, 30.0]}),
        "a",
        "b",
    )

    # Same-value groups: std should be 0.0, not NaN
    pdf = pd.DataFrame({"a": [1, 1, 2], "b": [5, 5, 10]})
    psdf = ps.from_pandas(pdf)
    actual = psdf.groupby("a")["b"].describe().sort_index()
    expected = pdf.groupby("a")["b"].describe().sort_index()
    # Group a=1 has two identical values, so std must be 0.0
    self.assertEqual(actual.loc[1, "std"], 0.0)
    stats = ["count", "mean", "std", "min", "max"]
    self.assert_eq(
        actual[stats],
        expected[stats],
        check_exact=False,
    )

    # String group key with numeric values -- check both non-percentile and percentile stats
    self._check_series_groupby_describe(
        pd.DataFrame({"a": ["x", "x", "y"], "b": [4, 5, 6]}), "a", "b"
    )

    # String type series should raise NotImplementedError with a descriptive message
    pdf = pd.DataFrame({"a": ["x", "x", "y"], "b": ["d", "e", "f"]})
    psdf = ps.from_pandas(pdf)
    self.assertRaisesRegex(
        NotImplementedError,
        "doesn't support for string type",
        lambda: psdf.groupby("a")["b"].describe(),
    )
184+
114185

115186
class GroupbyDescribeTests(
116187
GroupbyDescribeMixin,

0 commit comments

Comments
 (0)