Skip to content

Commit d7d2280

Browse files
devin-petersohn authored and HyukjinKwon committed
[SPARK-40337][PS] Implement SeriesGroupBy.describe in pandas API on Spark
### What changes were proposed in this pull request? Implement `SeriesGroupBy.describe()` in the pandas API on Spark ### Why are the changes needed? Missing API coverage ### Does this PR introduce _any_ user-facing change? Yes ### How was this patch tested? Unit tests ### Was this patch authored or co-authored using generative AI tooling? Generated-by: Claude Code (claude-opus-4-6) Closes #55686 from devin-petersohn/SPARK-40337-series-groupby-describe. Authored-by: Devin Petersohn <devin.petersohn@gmail.com> Signed-off-by: Hyukjin Kwon <gurwls223@apache.org> (cherry picked from commit 11d51c7) Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
1 parent 9f6486e commit d7d2280

4 files changed

Lines changed: 142 additions & 1 deletion

File tree

python/docs/source/reference/pyspark.pandas/groupby.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ The following methods are available only for `SeriesGroupBy` objects.
104104
.. autosummary::
105105
:toctree: api/
106106

107+
SeriesGroupBy.describe
107108
SeriesGroupBy.nsmallest
108109
SeriesGroupBy.nlargest
109110
SeriesGroupBy.value_counts

python/pyspark/pandas/groupby.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4371,6 +4371,76 @@ def _handle_output(
43714371
else:
43724372
return first_series(psdf).rename(self._psser.name)
43734373

4374+
def describe(self) -> DataFrame:
    """
    Generate descriptive statistics that summarize the central tendency,
    dispersion and shape of a dataset's distribution, excluding
    ``NaN`` values.

    .. versionadded:: 4.3.0

    .. note:: Unlike pandas, the percentiles in pandas-on-Spark are based upon
        approximate percentile computation because computing percentiles
        across a large dataset is extremely expensive.

    Returns
    -------
    DataFrame
        Summary statistics for each group and the series values.

    See Also
    --------
    DataFrame.describe

    Examples
    --------
    >>> df = ps.DataFrame({'a': [1, 1, 3], 'b': [4, 5, 6]})
    >>> df.groupby('a')['b'].describe().sort_index()  # doctest: +NORMALIZE_WHITESPACE
       count  mean       std  min  25%  50%  75%  max
    a
    1    2.0   4.5  0.707107  4.0  4.0  4.0  5.0  5.0
    3    1.0   6.0       NaN  6.0  6.0  6.0  6.0  6.0
    """
    # NOTE: keep in sync with GroupBy.describe
    if isinstance(self._agg_columns[0].spark.data_type, StringType):
        raise NotImplementedError(
            "SeriesGroupBy.describe() doesn't support for string type for now"
        )

    # Use the base GroupBy.aggregate to bypass SeriesGroupBy's unsupported override.
    psdf = GroupBy.aggregate(self, ["count", "mean", "std", "min", "quartiles", "max"])
    sdf = psdf._internal.spark_frame

    percentile_names = ["25%", "50%", "75%"]
    labels = [psser._column_label for psser in self._agg_columns]

    def _col_name(label, stat):
        # Flat Spark column name for (label, stat), e.g. ("b",) + "count" -> "(b, count)".
        return name_like_string(tuple(label) + (stat,))

    # Expand each "quartiles" array column into separate 25%/50%/75% columns.
    for label in labels:
        quartiles_name = _col_name(label, "quartiles")
        for idx, pct in enumerate(percentile_names):
            sdf = sdf.withColumn(
                _col_name(label, pct),
                scol_for(sdf, quartiles_name)[idx],
            )
        sdf = sdf.drop(quartiles_name)

    # Build single-level column labels: ["count", "mean", "std", "min", "25%", ...]
    stat_names = ["count", "mean", "std", "min"] + percentile_names + ["max"]
    # For SeriesGroupBy there is only one aggregated column label.
    only_label = labels[0]
    data_columns = [_col_name(only_label, stat) for stat in stat_names]

    internal = psdf._internal.copy(
        spark_frame=sdf,
        column_labels=[(stat,) for stat in stat_names],
        data_spark_columns=[scol_for(sdf, name) for name in data_columns],
        data_fields=None,
        column_label_names=None,
    )

    # Cast columns to "float64" to match pandas.SeriesGroupBy.describe().
    return DataFrame(internal).astype("float64")
4443+
43744444
def agg(self, *args: Any, **kwargs: Any) -> None:
43754445
return MissingPandasLikeSeriesGroupBy.agg(self, *args, **kwargs)
43764446

python/pyspark/pandas/missing/groupby.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,6 @@ class MissingPandasLikeSeriesGroupBy:
8484
# Functions
8585
agg = _unsupported_function("agg")
8686
aggregate = _unsupported_function("aggregate")
87-
describe = _unsupported_function("describe")
8887
ngroup = _unsupported_function("ngroup")
8988
ohlc = _unsupported_function("ohlc")
9089
pct_change = _unsupported_function("pct_change")

python/pyspark/pandas/tests/groupby/test_describe.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,77 @@ def test_describe(self):
111111
quantile_pdf.rename(columns="{:.0%}".format, level=2),
112112
)
113113

114+
def _check_series_groupby_describe(self, pdf, groupby_col, value_col):
    """Compare SeriesGroupBy.describe between pandas and pandas-on-Spark."""
    psdf = ps.from_pandas(pdf)

    expected = pdf.groupby(groupby_col)[value_col].describe().sort_index()
    actual = psdf.groupby(groupby_col)[value_col].describe().sort_index()

    plain_stats = ["count", "mean", "std", "min", "max"]
    pct_labels = ["25%", "50%", "75%"]
    pct_values = [0.25, 0.5, 0.75]

    # 1. Non-percentile statistics should agree with pandas (float tolerance).
    self.assert_eq(
        actual[plain_stats],
        expected[plain_stats],
        check_exact=False,
    )

    # 2. Percentiles are approximate in Spark, so compare against pandas'
    #    nearest-interpolation quantiles rather than the default linear ones.
    reference = (
        pdf.groupby(groupby_col)[value_col]
        .quantile(pct_values, interpolation="nearest")
        .unstack(level=1)
        .astype(float)
    )
    reference.columns = ["{:.0%}".format(p) for p in pct_values]
    self.assert_eq(
        actual[pct_labels],
        reference,
    )
145+
def test_series_groupby_describe(self):
    # Basic numeric case
    self._check_series_groupby_describe(
        pd.DataFrame({"a": [1, 1, 3], "b": [4, 5, 6]}), "a", "b"
    )

    # Floats and negatives with larger groups
    self._check_series_groupby_describe(
        pd.DataFrame({"a": [1, 1, 1, 2, 2, 2], "b": [-1.5, 2.0, 3.5, 10.0, 20.0, 30.0]}),
        "a",
        "b",
    )

    # Same-value groups: std should be 0.0, not NaN
    pdf = pd.DataFrame({"a": [1, 1, 2], "b": [5, 5, 10]})
    psdf = ps.from_pandas(pdf)
    actual = psdf.groupby("a")["b"].describe().sort_index()
    expected = pdf.groupby("a")["b"].describe().sort_index()
    # Group a=1 has two identical values, so std must be 0.0
    self.assertEqual(actual.loc[1, "std"], 0.0)
    stats = ["count", "mean", "std", "min", "max"]
    self.assert_eq(
        actual[stats],
        expected[stats],
        check_exact=False,
    )

    # String group key with numeric values -- check both non-percentile and percentile stats
    self._check_series_groupby_describe(
        pd.DataFrame({"a": ["x", "x", "y"], "b": [4, 5, 6]}), "a", "b"
    )

    # String type series should raise NotImplementedError with a descriptive message
    pdf = pd.DataFrame({"a": ["x", "x", "y"], "b": ["d", "e", "f"]})
    psdf = ps.from_pandas(pdf)
    self.assertRaisesRegex(
        NotImplementedError,
        "doesn't support for string type",
        lambda: psdf.groupby("a")["b"].describe(),
    )
184+
114185

115186
class GroupbyDescribeTests(
116187
GroupbyDescribeMixin,

0 commit comments

Comments
 (0)