Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions python/docs/source/reference/pyspark.pandas/groupby.rst
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ The following methods are available only for `SeriesGroupBy` objects.
.. autosummary::
:toctree: api/

SeriesGroupBy.describe
SeriesGroupBy.nsmallest
SeriesGroupBy.nlargest
SeriesGroupBy.value_counts
Expand Down
70 changes: 70 additions & 0 deletions python/pyspark/pandas/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -4371,6 +4371,76 @@ def _handle_output(
else:
return first_series(psdf).rename(self._psser.name)

def describe(self) -> DataFrame:
    """
    Generate descriptive statistics that summarize the central tendency,
    dispersion and shape of a dataset's distribution, excluding
    ``NaN`` values.

    .. versionadded:: 4.3.0

    .. note:: Unlike pandas, the percentiles in pandas-on-Spark are based upon
        approximate percentile computation because computing percentiles
        across a large dataset is extremely expensive.

    Returns
    -------
    DataFrame
        Summary statistics for each group and the series values.

    See Also
    --------
    DataFrame.describe

    Examples
    --------
    >>> df = ps.DataFrame({'a': [1, 1, 3], 'b': [4, 5, 6]})
    >>> df.groupby('a')['b'].describe().sort_index()  # doctest: +NORMALIZE_WHITESPACE
       count  mean       std  min  25%  50%  75%  max
    a
    1    2.0   4.5  0.707107  4.0  4.0  4.0  5.0  5.0
    3    1.0   6.0       NaN  6.0  6.0  6.0  6.0  6.0
    """
    # NOTE: keep in sync with GroupBy.describe
    if isinstance(self._agg_columns[0].spark.data_type, StringType):
        raise NotImplementedError(
            "SeriesGroupBy.describe() doesn't support for string type for now"
        )

    # Aggregate through the base class; SeriesGroupBy's own aggregate()
    # override is deliberately unsupported, so it must be bypassed here.
    agg_frame = GroupBy.aggregate(self, ["count", "mean", "std", "min", "quartiles", "max"])
    spark_df = agg_frame._internal.spark_frame
    labels = [col._column_label for col in self._agg_columns]
    percentile_names = ["25%", "50%", "75%"]

    # Each "quartiles" aggregation produced an array column; expand it
    # into one Spark column per percentile, then discard the array.
    for label in labels:
        array_col = name_like_string(tuple(label) + ("quartiles",))
        for position, percentile_name in enumerate(percentile_names):
            spark_df = spark_df.withColumn(
                name_like_string(tuple(label) + (percentile_name,)),
                scol_for(spark_df, array_col)[position],
            )
        spark_df = spark_df.drop(array_col)

    # Final flat statistic names, in pandas' describe() column order.
    stat_names = ["count", "mean", "std", "min"] + percentile_names + ["max"]
    # A SeriesGroupBy aggregates exactly one column, so one label remains.
    only_label = labels[0]
    flat_columns = [name_like_string(tuple(only_label) + (stat,)) for stat in stat_names]

    internal = agg_frame._internal.copy(
        spark_frame=spark_df,
        column_labels=[(stat,) for stat in stat_names],
        data_spark_columns=[scol_for(spark_df, name) for name in flat_columns],
        data_fields=None,
        column_label_names=None,
    )

    # pandas.SeriesGroupBy.describe() yields float64 columns throughout.
    return DataFrame(internal).astype("float64")

def agg(self, *args: Any, **kwargs: Any) -> None:
    # agg() is deliberately unsupported on SeriesGroupBy; delegate to the
    # missing-API stub (built via _unsupported_function), which presumably
    # raises the standard "not implemented" error with a helpful message.
    return MissingPandasLikeSeriesGroupBy.agg(self, *args, **kwargs)

Expand Down
1 change: 0 additions & 1 deletion python/pyspark/pandas/missing/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,6 @@ class MissingPandasLikeSeriesGroupBy:
# Functions
agg = _unsupported_function("agg")
aggregate = _unsupported_function("aggregate")
describe = _unsupported_function("describe")
ngroup = _unsupported_function("ngroup")
ohlc = _unsupported_function("ohlc")
pct_change = _unsupported_function("pct_change")
Expand Down
71 changes: 71 additions & 0 deletions python/pyspark/pandas/tests/groupby/test_describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,77 @@ def test_describe(self):
quantile_pdf.rename(columns="{:.0%}".format, level=2),
)

def _check_series_groupby_describe(self, pdf, groupby_col, value_col):
    """Compare SeriesGroupBy.describe on pandas-on-Spark against pandas."""
    psdf = ps.from_pandas(pdf)

    expected = pdf.groupby(groupby_col)[value_col].describe().sort_index()
    actual = psdf.groupby(groupby_col)[value_col].describe().sort_index()

    plain_stats = ["count", "mean", "std", "min", "max"]
    quantile_levels = [0.25, 0.5, 0.75]
    percentile_names = ["{:.0%}".format(level) for level in quantile_levels]

    # 1. Non-percentile statistics should agree up to floating-point noise.
    self.assert_eq(actual[plain_stats], expected[plain_stats], check_exact=False)

    # 2. Percentiles are approximate in pandas-on-Spark, so the reference is
    #    pandas' quantile with nearest interpolation rather than describe().
    expected_percentiles = (
        pdf.groupby(groupby_col)[value_col]
        .quantile(quantile_levels, interpolation="nearest")
        .unstack(level=1)
        .astype(float)
    )
    expected_percentiles.columns = percentile_names
    self.assert_eq(actual[percentile_names], expected_percentiles)

def test_series_groupby_describe(self):
    numeric_cases = [
        # Basic numeric case
        pd.DataFrame({"a": [1, 1, 3], "b": [4, 5, 6]}),
        # Floats and negatives with larger groups
        pd.DataFrame({"a": [1, 1, 1, 2, 2, 2], "b": [-1.5, 2.0, 3.5, 10.0, 20.0, 30.0]}),
        # String group key with numeric values -- both stat families checked
        pd.DataFrame({"a": ["x", "x", "y"], "b": [4, 5, 6]}),
    ]
    for frame in numeric_cases:
        self._check_series_groupby_describe(frame, "a", "b")

    # Same-value groups: std should be 0.0, not NaN
    pdf = pd.DataFrame({"a": [1, 1, 2], "b": [5, 5, 10]})
    psdf = ps.from_pandas(pdf)
    actual = psdf.groupby("a")["b"].describe().sort_index()
    expected = pdf.groupby("a")["b"].describe().sort_index()
    # Group a=1 has two identical values, so std must be 0.0
    self.assertEqual(actual.loc[1, "std"], 0.0)
    stats = ["count", "mean", "std", "min", "max"]
    self.assert_eq(actual[stats], expected[stats], check_exact=False)

    # String type series should raise NotImplementedError with a descriptive message
    pdf = pd.DataFrame({"a": ["x", "x", "y"], "b": ["d", "e", "f"]})
    psdf = ps.from_pandas(pdf)
    self.assertRaisesRegex(
        NotImplementedError,
        "doesn't support for string type",
        lambda: psdf.groupby("a")["b"].describe(),
    )


class GroupbyDescribeTests(
GroupbyDescribeMixin,
Expand Down