Skip to content

DEPR: Remove datetime_is_numeric in describe #49368

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Oct 31, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ Removal of prior version deprecations/changes
- Removed argument ``sort_columns`` in :meth:`DataFrame.plot` and :meth:`Series.plot` (:issue:`47563`)
- Removed argument ``is_copy`` from :meth:`DataFrame.take` and :meth:`Series.take` (:issue:`30615`)
- Removed argument ``kind`` from :meth:`Index.get_slice_bound`, :meth:`Index.slice_indexer` and :meth:`Index.slice_locs` (:issue:`41378`)
- Removed argument ``datetime_is_numeric`` from :meth:`DataFrame.describe` and :meth:`Series.describe` as datetime data will always be summarized as numeric data (:issue:`34798`)
- Disallow subclass-specific keywords (e.g. "freq", "tz", "names", "closed") in the :class:`Index` constructor (:issue:`38597`)
- Removed argument ``inplace`` from :meth:`Categorical.remove_unused_categories` (:issue:`37918`)
- Disallow passing non-round floats to :class:`Timestamp` with ``unit="M"`` or ``unit="Y"`` (:issue:`47266`)
Expand Down
39 changes: 5 additions & 34 deletions pandas/core/describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
Sequence,
cast,
)
import warnings

import numpy as np

Expand All @@ -27,7 +26,6 @@
NDFrameT,
npt,
)
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import validate_percentile

from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -56,7 +54,6 @@ def describe_ndframe(
obj: NDFrameT,
include: str | Sequence[str] | None,
exclude: str | Sequence[str] | None,
datetime_is_numeric: bool,
percentiles: Sequence[float] | np.ndarray | None,
) -> NDFrameT:
"""Describe series or dataframe.
Expand All @@ -71,8 +68,6 @@ def describe_ndframe(
A white list of data types to include in the result. Ignored for ``Series``.
exclude : list-like of dtypes or None (default), optional,
A black list of data types to omit from the result. Ignored for ``Series``.
datetime_is_numeric : bool, default False
Whether to treat datetime dtypes as numeric.
percentiles : list-like of numbers, optional
The percentiles to include in the output. All should fall between 0 and 1.
The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and
Expand All @@ -88,14 +83,12 @@ def describe_ndframe(
if obj.ndim == 1:
describer = SeriesDescriber(
obj=cast("Series", obj),
datetime_is_numeric=datetime_is_numeric,
)
else:
describer = DataFrameDescriber(
obj=cast("DataFrame", obj),
include=include,
exclude=exclude,
datetime_is_numeric=datetime_is_numeric,
)

result = describer.describe(percentiles=percentiles)
Expand All @@ -109,13 +102,10 @@ class NDFrameDescriberAbstract(ABC):
----------
obj : Series or DataFrame
Object to be described.
datetime_is_numeric : bool
Whether to treat datetime dtypes as numeric.
"""

def __init__(self, obj: DataFrame | Series, datetime_is_numeric: bool) -> None:
def __init__(self, obj: DataFrame | Series) -> None:
self.obj = obj
self.datetime_is_numeric = datetime_is_numeric

@abstractmethod
def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame | Series:
Expand All @@ -136,7 +126,6 @@ class SeriesDescriber(NDFrameDescriberAbstract):
def describe(self, percentiles: Sequence[float] | np.ndarray) -> Series:
describe_func = select_describe_func(
self.obj,
self.datetime_is_numeric,
)
return describe_func(self.obj, percentiles)

Expand All @@ -152,8 +141,6 @@ class DataFrameDescriber(NDFrameDescriberAbstract):
A white list of data types to include in the result.
exclude : list-like of dtypes or None
A black list of data types to omit from the result.
datetime_is_numeric : bool
Whether to treat datetime dtypes as numeric.
"""

def __init__(
Expand All @@ -162,22 +149,21 @@ def __init__(
*,
include: str | Sequence[str] | None,
exclude: str | Sequence[str] | None,
datetime_is_numeric: bool,
) -> None:
self.include = include
self.exclude = exclude

if obj.ndim == 2 and obj.columns.size == 0:
raise ValueError("Cannot describe a DataFrame without columns")

super().__init__(obj, datetime_is_numeric=datetime_is_numeric)
super().__init__(obj)

def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame:
data = self._select_data()

ldesc: list[Series] = []
for _, series in data.items():
describe_func = select_describe_func(series, self.datetime_is_numeric)
describe_func = select_describe_func(series)
ldesc.append(describe_func(series, percentiles))

col_names = reorder_columns(ldesc)
Expand All @@ -194,8 +180,7 @@ def _select_data(self):
if (self.include is None) and (self.exclude is None):
# when some numerics are found, keep only numerics
default_include: list[npt.DTypeLike] = [np.number]
if self.datetime_is_numeric:
default_include.append("datetime")
default_include.append("datetime")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we now simplify this to

default_include: list[npt.DTypeLike] = [np.number, "datetime"]

and drop the extra append step?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! Yes, this is a good simplification.

data = self.obj.select_dtypes(include=default_include)
if len(data.columns) == 0:
data = self.obj
Expand Down Expand Up @@ -360,34 +345,20 @@ def describe_timestamp_1d(data: Series, percentiles: Sequence[float]) -> Series:

def select_describe_func(
data: Series,
datetime_is_numeric: bool,
) -> Callable:
"""Select proper function for describing series based on data type.

Parameters
----------
data : Series
Series to be described.
datetime_is_numeric : bool
Whether to treat datetime dtypes as numeric.
"""
if is_bool_dtype(data.dtype):
return describe_categorical_1d
elif is_numeric_dtype(data):
return describe_numeric_1d
elif is_datetime64_any_dtype(data.dtype):
if datetime_is_numeric:
return describe_timestamp_1d
else:
warnings.warn(
"Treating datetime data as categorical rather than numeric in "
"`.describe` is deprecated and will be removed in a future "
"version of pandas. Specify `datetime_is_numeric=True` to "
"silence this warning and adopt the future behavior now.",
FutureWarning,
stacklevel=find_stack_level(),
)
return describe_timestamp_as_categorical_1d
return describe_timestamp_1d
elif is_timedelta64_dtype(data.dtype):
return describe_numeric_1d
else:
Expand Down
10 changes: 1 addition & 9 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -10588,7 +10588,6 @@ def describe(
percentiles=None,
include=None,
exclude=None,
datetime_is_numeric: bool_t = False,
) -> NDFrameT:
"""
Generate descriptive statistics.
Expand Down Expand Up @@ -10634,12 +10633,6 @@ def describe(
``select_dtypes`` (e.g. ``df.describe(exclude=['O'])``). To
exclude pandas categorical columns, use ``'category'``
- None (default) : The result will exclude nothing.
datetime_is_numeric : bool, default False
Whether to treat datetime dtypes as numeric. This affects statistics
calculated for the column. For DataFrame input, this also
controls whether datetime columns are included by default.

.. versionadded:: 1.1.0

Returns
-------
Expand Down Expand Up @@ -10717,7 +10710,7 @@ def describe(
... np.datetime64("2010-01-01"),
... np.datetime64("2010-01-01")
... ])
>>> s.describe(datetime_is_numeric=True)
>>> s.describe()
count 3
mean 2006-09-01 08:00:00
min 2000-01-01 00:00:00
Expand Down Expand Up @@ -10835,7 +10828,6 @@ def describe(
obj=self,
include=include,
exclude=exclude,
datetime_is_numeric=datetime_is_numeric,
percentiles=percentiles,
)

Expand Down
30 changes: 8 additions & 22 deletions pandas/tests/frame/methods/test_describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,12 +274,12 @@ def test_describe_tz_values(self, tz_naive_fixture):
},
index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"],
)
result = df.describe(include="all", datetime_is_numeric=True)
result = df.describe(include="all")
tm.assert_frame_equal(result, expected)

def test_datetime_is_numeric_includes_datetime(self):
df = DataFrame({"a": date_range("2012", periods=3), "b": [1, 2, 3]})
result = df.describe(datetime_is_numeric=True)
result = df.describe()
expected = DataFrame(
{
"a": [
Expand Down Expand Up @@ -307,36 +307,22 @@ def test_describe_tz_values2(self):
df = DataFrame({"s1": s1, "s2": s2})

s1_ = s1.describe()
s2_ = Series(
[
5,
5,
s2.value_counts().index[0],
1,
start.tz_localize(tz),
end.tz_localize(tz),
],
index=["count", "unique", "top", "freq", "first", "last"],
)
s2_ = s2.describe()
idx = [
"count",
"unique",
"top",
"freq",
"first",
"last",
"mean",
"std",
"min",
"25%",
"50%",
"75%",
"max",
"std",
]
expected = pd.concat([s1_, s2_], axis=1, keys=["s1", "s2"]).loc[idx]
expected = pd.concat([s1_, s2_], axis=1, keys=["s1", "s2"]).reindex(
idx, copy=False
)

with tm.assert_produces_warning(FutureWarning):
result = df.describe(include="all")
result = df.describe(include="all")
tm.assert_frame_equal(result, expected)

def test_describe_percentiles_integer_idx(self):
Expand Down
22 changes: 11 additions & 11 deletions pandas/tests/series/methods/test_describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def test_describe_with_tz(self, tz_naive_fixture):
start = Timestamp(2018, 1, 1)
end = Timestamp(2018, 1, 5)
s = Series(date_range(start, end, tz=tz), name=name)
result = s.describe(datetime_is_numeric=True)
result = s.describe()
expected = Series(
[
5,
Expand All @@ -115,32 +115,32 @@ def test_describe_with_tz(self, tz_naive_fixture):
)
tm.assert_series_equal(result, expected)

def test_describe_with_tz_warns(self):
def test_describe_with_tz_numeric(self):
name = tz = "CET"
start = Timestamp(2018, 1, 1)
end = Timestamp(2018, 1, 5)
s = Series(date_range(start, end, tz=tz), name=name)

with tm.assert_produces_warning(FutureWarning):
result = s.describe()
result = s.describe()

expected = Series(
[
5,
5,
s.value_counts().index[0],
1,
start.tz_localize(tz),
end.tz_localize(tz),
Timestamp("2018-01-03 00:00:00", tz=tz),
Timestamp("2018-01-01 00:00:00", tz=tz),
Timestamp("2018-01-02 00:00:00", tz=tz),
Timestamp("2018-01-03 00:00:00", tz=tz),
Timestamp("2018-01-04 00:00:00", tz=tz),
Timestamp("2018-01-05 00:00:00", tz=tz),
],
name=name,
index=["count", "unique", "top", "freq", "first", "last"],
index=["count", "mean", "min", "25%", "50%", "75%", "max"],
)
tm.assert_series_equal(result, expected)

def test_datetime_is_numeric_includes_datetime(self):
s = Series(date_range("2012", periods=3))
result = s.describe(datetime_is_numeric=True)
result = s.describe()
expected = Series(
[
3,
Expand Down