diff --git a/python/docs/source/reference/pyspark.pandas/groupby.rst b/python/docs/source/reference/pyspark.pandas/groupby.rst
index 7a0c771e8caac..f86a7572666b4 100644
--- a/python/docs/source/reference/pyspark.pandas/groupby.rst
+++ b/python/docs/source/reference/pyspark.pandas/groupby.rst
@@ -104,6 +104,7 @@ The following methods are available only for `SeriesGroupBy` objects.
 .. autosummary::
    :toctree: api/
 
+   SeriesGroupBy.describe
    SeriesGroupBy.nsmallest
    SeriesGroupBy.nlargest
    SeriesGroupBy.value_counts
diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
index b0879c9410f97..c5b81f05cc578 100644
--- a/python/pyspark/pandas/groupby.py
+++ b/python/pyspark/pandas/groupby.py
@@ -4371,6 +4371,76 @@ def _handle_output(
         else:
             return first_series(psdf).rename(self._psser.name)
 
+    def describe(self) -> DataFrame:
+        """
+        Generate descriptive statistics that summarize the central tendency,
+        dispersion and shape of a dataset's distribution, excluding
+        ``NaN`` values.
+
+        .. versionadded:: 4.3.0
+
+        .. note:: Unlike pandas, the percentiles in pandas-on-Spark are based upon
+            approximate percentile computation because computing percentiles
+            across a large dataset is extremely expensive.
+
+        Returns
+        -------
+        DataFrame
+            Summary statistics for each group and the series values.
+
+        See Also
+        --------
+        DataFrame.describe
+
+        Examples
+        --------
+        >>> df = ps.DataFrame({'a': [1, 1, 3], 'b': [4, 5, 6]})
+        >>> df.groupby('a')['b'].describe().sort_index()  # doctest: +NORMALIZE_WHITESPACE
+           count  mean       std  min  25%  50%  75%  max
+        a
+        1    2.0   4.5  0.707107  4.0  4.0  4.0  5.0  5.0
+        3    1.0   6.0       NaN  6.0  6.0  6.0  6.0  6.0
+        """
+        # NOTE: keep in sync with GroupBy.describe
+        if isinstance(self._agg_columns[0].spark.data_type, StringType):
+            raise NotImplementedError(
+                "SeriesGroupBy.describe() doesn't support for string type for now"
+            )
+
+        # Use the base GroupBy.aggregate to bypass SeriesGroupBy's unsupported override.
+        psdf = GroupBy.aggregate(self, ["count", "mean", "std", "min", "quartiles", "max"])
+        sdf = psdf._internal.spark_frame
+        agg_column_labels = [col._column_label for col in self._agg_columns]
+        formatted_percentiles = ["25%", "50%", "75%"]
+
+        # Split "quartiles" columns into first, second, and third quartiles.
+        for label in agg_column_labels:
+            quartiles_col = name_like_string(tuple(list(label) + ["quartiles"]))
+            for i, percentile in enumerate(formatted_percentiles):
+                sdf = sdf.withColumn(
+                    name_like_string(tuple(list(label) + [percentile])),
+                    scol_for(sdf, quartiles_col)[i],
+                )
+            sdf = sdf.drop(quartiles_col)
+
+        # Build single-level column labels: ["count", "mean", "std", "min", "25%", ...]
+        stats = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
+        # For SeriesGroupBy there is only one agg column label.
+        label = agg_column_labels[0]
+        multi_column_labels = [tuple(list(label) + [s]) for s in stats]
+        data_columns = [name_like_string(cl) for cl in multi_column_labels]
+
+        internal = psdf._internal.copy(
+            spark_frame=sdf,
+            column_labels=[(s,) for s in stats],
+            data_spark_columns=[scol_for(sdf, col) for col in data_columns],
+            data_fields=None,
+            column_label_names=None,
+        )
+
+        # Cast columns to "float64" to match pandas.SeriesGroupBy.describe().
+        return DataFrame(internal).astype("float64")
+
     def agg(self, *args: Any, **kwargs: Any) -> None:
         return MissingPandasLikeSeriesGroupBy.agg(self, *args, **kwargs)
diff --git a/python/pyspark/pandas/missing/groupby.py b/python/pyspark/pandas/missing/groupby.py
index a6b672df916ca..04891006dee75 100644
--- a/python/pyspark/pandas/missing/groupby.py
+++ b/python/pyspark/pandas/missing/groupby.py
@@ -84,7 +84,6 @@ class MissingPandasLikeSeriesGroupBy:
     # Functions
     agg = _unsupported_function("agg")
     aggregate = _unsupported_function("aggregate")
-    describe = _unsupported_function("describe")
     ngroup = _unsupported_function("ngroup")
     ohlc = _unsupported_function("ohlc")
     pct_change = _unsupported_function("pct_change")
diff --git a/python/pyspark/pandas/tests/groupby/test_describe.py b/python/pyspark/pandas/tests/groupby/test_describe.py
index a0ffd49ed4f85..026ca8466bca6 100644
--- a/python/pyspark/pandas/tests/groupby/test_describe.py
+++ b/python/pyspark/pandas/tests/groupby/test_describe.py
@@ -111,6 +111,77 @@ def test_describe(self):
             quantile_pdf.rename(columns="{:.0%}".format, level=2),
         )
 
+    def _check_series_groupby_describe(self, pdf, groupby_col, value_col):
+        """Helper to check SeriesGroupBy.describe against pandas."""
+        psdf = ps.from_pandas(pdf)
+
+        describe_pdf = pdf.groupby(groupby_col)[value_col].describe().sort_index()
+        describe_psdf = psdf.groupby(groupby_col)[value_col].describe().sort_index()
+
+        non_percentile_stats = ["count", "mean", "std", "min", "max"]
+        formatted_percentiles = ["25%", "50%", "75%"]
+        percentiles = [0.25, 0.5, 0.75]
+
+        # 1. Check non-percentile stats.
+        self.assert_eq(
+            describe_psdf[non_percentile_stats],
+            describe_pdf[non_percentile_stats],
+            check_exact=False,
+        )
+
+        # 2. Check percentile stats (approximate percentiles use nearest interpolation).
+        quantile_pdf = (
+            pdf.groupby(groupby_col)[value_col]
+            .quantile(percentiles, interpolation="nearest")
+            .unstack(level=1)
+            .astype(float)
+        )
+        quantile_pdf.columns = ["{:.0%}".format(p) for p in percentiles]
+        self.assert_eq(
+            describe_psdf[formatted_percentiles],
+            quantile_pdf,
+        )
+
+    def test_series_groupby_describe(self):
+        # Basic numeric case
+        self._check_series_groupby_describe(
+            pd.DataFrame({"a": [1, 1, 3], "b": [4, 5, 6]}), "a", "b"
+        )
+
+        # Floats and negatives with larger groups
+        self._check_series_groupby_describe(
+            pd.DataFrame({"a": [1, 1, 1, 2, 2, 2], "b": [-1.5, 2.0, 3.5, 10.0, 20.0, 30.0]}),
+            "a",
+            "b",
+        )
+
+        # Same-value groups: std should be 0.0, not NaN
+        pdf = pd.DataFrame({"a": [1, 1, 2], "b": [5, 5, 10]})
+        psdf = ps.from_pandas(pdf)
+        describe_psdf = psdf.groupby("a")["b"].describe().sort_index()
+        describe_pdf = pdf.groupby("a")["b"].describe().sort_index()
+        # Group a=1 has two identical values, so std must be 0.0
+        self.assertEqual(describe_psdf.loc[1, "std"], 0.0)
+        self.assert_eq(
+            describe_psdf[["count", "mean", "std", "min", "max"]],
+            describe_pdf[["count", "mean", "std", "min", "max"]],
+            check_exact=False,
+        )
+
+        # String group key with numeric values -- check both non-percentile and percentile stats
+        self._check_series_groupby_describe(
+            pd.DataFrame({"a": ["x", "x", "y"], "b": [4, 5, 6]}), "a", "b"
+        )
+
+        # String type series should raise NotImplementedError with a descriptive message
+        pdf = pd.DataFrame({"a": ["x", "x", "y"], "b": ["d", "e", "f"]})
+        psdf = ps.from_pandas(pdf)
+        self.assertRaisesRegex(
+            NotImplementedError,
+            "doesn't support for string type",
+            lambda: psdf.groupby("a")["b"].describe(),
+        )
+
 
 class GroupbyDescribeTests(
     GroupbyDescribeMixin,