Skip to content

Commit 92bf98f

Browse files
[backport 2.3.x] BUG: fix .str.isdigit to honor unicode superscript for older pyarrow (#61962) (#62476)
1 parent e57c7d6 commit 92bf98f

File tree

3 files changed

+35
-5
lines changed

3 files changed

+35
-5
lines changed

doc/source/whatsnew/v2.3.2.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ become the default string dtype in pandas 3.0. See
2222

2323
Bug fixes
2424
^^^^^^^^^
25+
- Fix :meth:`~Series.str.isdigit` to correctly recognize unicode superscript
26+
characters as digits for :class:`StringDtype` backed by PyArrow (:issue:`61466`)
2527
- Fix :meth:`~DataFrame.to_json` with ``orient="table"`` to correctly use the
2628
"string" type in the JSON Table Schema for :class:`StringDtype` columns
2729
(:issue:`61889`)

pandas/core/arrays/_arrow_string_mixins.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
pa_version_under11p0,
1717
pa_version_under13p0,
1818
pa_version_under17p0,
19+
pa_version_under21p0,
1920
)
2021

2122
if not pa_version_under10p1:
@@ -268,6 +269,12 @@ def _str_isdecimal(self):
268269
return self._convert_bool_result(result)
269270

270271
def _str_isdigit(self):
272+
if pa_version_under21p0:
273+
# https://github.com/pandas-dev/pandas/issues/61466
274+
res_list = self._apply_elementwise(str.isdigit)
275+
return self._convert_bool_result(
276+
pa.chunked_array(res_list, type=pa.bool_())
277+
)
271278
result = pc.utf8_is_digit(self._pa_array)
272279
return self._convert_bool_result(result)
273280

pandas/tests/strings/test_strings.py

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,15 @@
77
import numpy as np
88
import pytest
99

10+
from pandas.compat import pa_version_under21p0
11+
1012
from pandas import (
1113
NA,
1214
DataFrame,
1315
Index,
1416
MultiIndex,
1517
Series,
18+
StringDtype,
1619
)
1720
import pandas._testing as tm
1821
from pandas.core.strings.accessor import StringMethods
@@ -240,8 +243,9 @@ def test_ismethods(method, expected, any_string_dtype):
240243
@pytest.mark.parametrize(
241244
"method, expected",
242245
[
243-
("isnumeric", [False, True, True, False, True, True, False]),
244-
("isdecimal", [False, True, False, False, False, True, False]),
246+
("isnumeric", [False, True, True, True, False, True, True, False]),
247+
("isdecimal", [False, True, False, False, False, False, True, False]),
248+
("isdigit", [False, True, True, False, False, False, True, False]),
245249
],
246250
)
247251
def test_isnumeric_unicode(method, expected, any_string_dtype):
@@ -250,18 +254,35 @@ def test_isnumeric_unicode(method, expected, any_string_dtype):
250254
# 0x1378: ፸ ETHIOPIC NUMBER SEVENTY
251255
# 0xFF13: ３ Em 3 # noqa: RUF003
252256
ser = Series(
253-
["A", "3", "¼", "★", "፸", "３", "four"], dtype=any_string_dtype # noqa: RUF001
257+
["A", "3", "³", "¼", "★", "፸", "３", "four"], # noqa: RUF001
258+
dtype=any_string_dtype,
254259
)
255260
expected_dtype = (
256261
"bool" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
257262
)
258263
expected = Series(expected, dtype=expected_dtype)
264+
if (
265+
method == "isdigit"
266+
and isinstance(ser.dtype, StringDtype)
267+
and ser.dtype.storage == "pyarrow"
268+
and not pa_version_under21p0
269+
):
270+
# known difference in behavior between Python and PyArrow Unicode handling:
271+
# PyArrow 21+ treats ¼ and ፸ as digits, while Python's str.isdigit does not
272+
expected.iloc[3] = True
273+
expected.iloc[5] = True
274+
259275
result = getattr(ser.str, method)()
260276
tm.assert_series_equal(result, expected)
261277

262278
# compare with standard library
263-
expected = [getattr(item, method)() for item in ser]
264-
assert list(result) == expected
279+
# (only for non-pyarrow storage given the above differences)
280+
if any_string_dtype == "object" or (
281+
isinstance(any_string_dtype, StringDtype)
282+
and any_string_dtype.storage == "python"
283+
):
284+
expected = [getattr(item, method)() for item in ser]
285+
assert list(result) == expected
265286

266287

267288
@pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning")

0 commit comments

Comments
 (0)