Skip to content

Commit cc957d1

Browse files
authored
PERF/ENH: add fast astyping for Categorical (#37355)
1 parent 085a22d commit cc957d1

File tree

5 files changed

+79
-13
lines changed

5 files changed

+79
-13
lines changed

asv_bench/benchmarks/categoricals.py

+43
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import string
2+
import sys
13
import warnings
24

35
import numpy as np
@@ -67,6 +69,47 @@ def time_existing_series(self):
6769
pd.Categorical(self.series)
6870

6971

72+
class AsType:
    """Benchmarks for casting categorical columns to other dtypes.

    ``setup`` builds ``self.df`` with categorical columns ``str``, ``int``,
    ``float`` and ``timestamp`` plus ``int_as_str``, ``float_as_str`` and
    ``timestamp_as_str`` (the string renderings of the latter three).

    The timed methods carry the ``time_`` prefix so that asv collects them;
    the original un-prefixed names are kept as aliases for any direct callers.
    """

    # Number of rows in the benchmark frame. A class attribute so that it can
    # be overridden without editing ``setup``.
    N = 10 ** 5

    # Fixed seed so that every run benchmarks the same data.
    SEED = 1234

    def setup(self):
        """Build the frame of categorical columns used by every benchmark."""
        n_rows = self.N
        # One seeded generator for all random data keeps results comparable
        # between runs and commits.
        rng = np.random.default_rng(self.SEED)

        # Pools of values each column is sampled from (with replacement).
        categories = {
            "str": list(string.ascii_letters),
            "int": rng.integers(2 ** 16, size=154),
            "float": sys.maxsize * rng.random(38),
            "timestamp": [
                pd.Timestamp(x, unit="s") for x in rng.integers(2 ** 18, size=578)
            ],
        }

        self.df = pd.DataFrame(
            {col: rng.choice(cats, n_rows) for col, cats in categories.items()}
        )

        # String renderings, used to benchmark parsing str categories back
        # into numeric dtypes.
        for col in ("int", "float", "timestamp"):
            self.df[col + "_as_str"] = self.df[col].astype(str)

        for col in self.df.columns:
            self.df[col] = self.df[col].astype("category")

    def time_astype_str(self):
        """Cast the int, float and timestamp categoricals to ``str``."""
        for col in ("int", "float", "timestamp"):
            self.df[col].astype("str")

    def time_astype_int(self):
        """Cast the ``int_as_str`` and ``timestamp`` categoricals to ``int``."""
        for col in ("int_as_str", "timestamp"):
            self.df[col].astype("int")

    def time_astype_float(self):
        """Cast four of the categorical columns to ``float``."""
        # NOTE(review): whether a datetime-backed categorical may be cast to
        # float depends on the pandas version under test -- confirm.
        for col in ("float_as_str", "int", "int_as_str", "timestamp"):
            self.df[col].astype("float")

    def time_astype_datetime(self):
        """Cast the float categorical to a tz-aware datetime dtype."""
        self.df["float"].astype(pd.DatetimeTZDtype(tz="US/Pacific"))

    # Backward-compatible aliases for the original method names. They do not
    # start with an asv benchmark prefix, so asv does not collect them twice.
    astype_str = time_astype_str
    astype_int = time_astype_int
    astype_float = time_astype_float
    astype_datetime = time_astype_datetime
112+
70113
class Concat:
71114
def setup(self):
72115
N = 10 ** 5

doc/source/whatsnew/v1.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -499,6 +499,7 @@ Performance improvements
499499
- Reduced peak memory usage in :meth:`DataFrame.to_pickle` when using ``protocol=5`` in python 3.8+ (:issue:`34244`)
500500
- Faster ``dir`` calls when there are many index labels, e.g. ``dir(ser)`` (:issue:`37450`)
501501
- Performance improvement in :class:`ExpandingGroupby` (:issue:`37064`)
502+
- Performance improvement in :meth:`Series.astype` and :meth:`DataFrame.astype` for :class:`Categorical` (:issue:`8628`)
502503
- Performance improvement in :meth:`pd.DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`); changes to the underlying hash function can change the sort order of ties in float-based indexes (e.g. :meth:`pd.Index.value_counts`)
503504
- Performance improvement in :meth:`pd.isin` for inputs with more than 1e6 elements
504505

pandas/core/arrays/categorical.py

+31-9
Original file line numberDiff line numberDiff line change
@@ -403,20 +403,42 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike:
403403
If copy is set to False and dtype is categorical, the original
404404
object is returned.
405405
"""
406-
if is_categorical_dtype(dtype):
406+
if self.dtype is dtype:
407+
result = self.copy() if copy else self
408+
409+
elif is_categorical_dtype(dtype):
407410
dtype = cast(Union[str, CategoricalDtype], dtype)
408411

409412
# GH 10696/18593/18630
410413
dtype = self.dtype.update_dtype(dtype)
411-
result = self.copy() if copy else self
412-
if dtype == self.dtype:
413-
return result
414-
return result._set_dtype(dtype)
415-
if is_extension_array_dtype(dtype):
416-
return array(self, dtype=dtype, copy=copy)
417-
if is_integer_dtype(dtype) and self.isna().any():
414+
self = self.copy() if copy else self
415+
result = self._set_dtype(dtype)
416+
417+
# TODO: consolidate with ndarray case?
418+
elif is_extension_array_dtype(dtype):
419+
result = array(self, dtype=dtype, copy=copy)
420+
421+
elif is_integer_dtype(dtype) and self.isna().any():
418422
raise ValueError("Cannot convert float NaN to integer")
419-
return np.array(self, dtype=dtype, copy=copy)
423+
424+
elif len(self.codes) == 0 or len(self.categories) == 0:
425+
result = np.array(self, dtype=dtype, copy=copy)
426+
427+
else:
428+
# GH8628 (PERF): astype the categories, then take by the codes, instead of astyping the full array
429+
try:
430+
astyped_cats = self.categories.astype(dtype=dtype, copy=copy)
431+
except (
432+
TypeError, # downstream error msg for CategoricalIndex is misleading
433+
ValueError,
434+
):
435+
msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}"
436+
raise ValueError(msg)
437+
438+
astyped_cats = extract_array(astyped_cats, extract_numpy=True)
439+
result = take_1d(astyped_cats, libalgos.ensure_platform_int(self._codes))
440+
441+
return result
420442

421443
@cache_readonly
422444
def itemsize(self) -> int:

pandas/tests/arrays/categorical/test_dtypes.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ def test_astype(self, ordered):
127127
expected = np.array(cat)
128128
tm.assert_numpy_array_equal(result, expected)
129129

130-
msg = "could not convert string to float"
130+
msg = r"Cannot cast object dtype to <class 'float'>"
131131
with pytest.raises(ValueError, match=msg):
132132
cat.astype(float)
133133

@@ -138,7 +138,7 @@ def test_astype(self, ordered):
138138
tm.assert_numpy_array_equal(result, expected)
139139

140140
result = cat.astype(int)
141-
expected = np.array(cat, dtype=int)
141+
expected = np.array(cat, dtype="int64")
142142
tm.assert_numpy_array_equal(result, expected)
143143

144144
result = cat.astype(float)

pandas/tests/series/test_dtypes.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -60,15 +60,15 @@ def test_astype_categorical_to_other(self):
6060
expected = ser
6161
tm.assert_series_equal(ser.astype("category"), expected)
6262
tm.assert_series_equal(ser.astype(CategoricalDtype()), expected)
63-
msg = r"could not convert string to float|invalid literal for float\(\)"
63+
msg = r"Cannot cast object dtype to float64"
6464
with pytest.raises(ValueError, match=msg):
6565
ser.astype("float64")
6666

6767
cat = Series(Categorical(["a", "b", "b", "a", "a", "c", "c", "c"]))
6868
exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"])
6969
tm.assert_series_equal(cat.astype("str"), exp)
7070
s2 = Series(Categorical(["1", "2", "3", "4"]))
71-
exp2 = Series([1, 2, 3, 4]).astype(int)
71+
exp2 = Series([1, 2, 3, 4]).astype("int64")
7272
tm.assert_series_equal(s2.astype("int"), exp2)
7373

7474
# object don't sort correctly, so just compare that we have the same

0 commit comments

Comments
 (0)