Skip to content

Commit c345ffd

Browse files
DEPR (string): non-bool na for obj.str.contains (#59615)
Co-authored-by: Joris Van den Bossche <[email protected]>
1 parent 94a7c14 commit c345ffd

File tree

4 files changed

+81
-3
lines changed

doc/source/whatsnew/v2.3.0.rst

+1 -1
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ notable_bug_fix1
5353

5454
Deprecations
5555
~~~~~~~~~~~~
56-
-
56+
- Deprecated allowing non-``bool`` values for ``na`` in :meth:`.str.contains`, :meth:`.str.startswith`, and :meth:`.str.endswith` for dtypes that do not already disallow these (:issue:`59615`)
5757
-
5858

5959
.. ---------------------------------------------------------------------------

pandas/core/arrays/string_arrow.py

+10
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
TYPE_CHECKING,
77
Union,
88
)
9+
import warnings
910

1011
import numpy as np
1112

@@ -19,6 +20,7 @@
1920
pa_version_under10p1,
2021
pa_version_under13p0,
2122
)
23+
from pandas.util._exceptions import find_stack_level
2224

2325
from pandas.core.dtypes.common import (
2426
is_scalar,
@@ -297,6 +299,14 @@ def _str_contains(
297299
result = pc.match_substring(self._pa_array, pat, ignore_case=not case)
298300
result = self._convert_bool_result(result, na=na)
299301
if not isna(na):
302+
if not isinstance(na, bool):
303+
# GH#59561
304+
warnings.warn(
305+
"Allowing a non-bool 'na' in obj.str.contains is deprecated "
306+
"and will raise in a future version.",
307+
FutureWarning,
308+
stacklevel=find_stack_level(),
309+
)
300310
result[isna(result)] = bool(na)
301311
return result
302312

pandas/core/strings/object_array.py

+26
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,14 @@
99
cast,
1010
)
1111
import unicodedata
12+
import warnings
1213

1314
import numpy as np
1415

1516
from pandas._libs import lib
1617
import pandas._libs.missing as libmissing
1718
import pandas._libs.ops as libops
19+
from pandas.util._exceptions import find_stack_level
1820

1921
from pandas.core.dtypes.missing import isna
2022

@@ -142,14 +144,38 @@ def _str_contains(
142144
else:
143145
upper_pat = pat.upper()
144146
f = lambda x: upper_pat in x.upper()
147+
if not isna(na) and not isinstance(na, bool):
148+
# GH#59561
149+
warnings.warn(
150+
"Allowing a non-bool 'na' in obj.str.contains is deprecated "
151+
"and will raise in a future version.",
152+
FutureWarning,
153+
stacklevel=find_stack_level(),
154+
)
145155
return self._str_map(f, na, dtype=np.dtype("bool"))
146156

147157
def _str_startswith(self, pat, na=None):
148158
f = lambda x: x.startswith(pat)
159+
if not isna(na) and not isinstance(na, bool):
160+
# GH#59561
161+
warnings.warn(
162+
"Allowing a non-bool 'na' in obj.str.startswith is deprecated "
163+
"and will raise in a future version.",
164+
FutureWarning,
165+
stacklevel=find_stack_level(),
166+
)
149167
return self._str_map(f, na_value=na, dtype=np.dtype(bool))
150168

151169
def _str_endswith(self, pat, na=None):
152170
f = lambda x: x.endswith(pat)
171+
if not isna(na) and not isinstance(na, bool):
172+
# GH#59561
173+
warnings.warn(
174+
"Allowing a non-bool 'na' in obj.str.endswith is deprecated "
175+
"and will raise in a future version.",
176+
FutureWarning,
177+
stacklevel=find_stack_level(),
178+
)
153179
return self._str_map(f, na_value=na, dtype=np.dtype(bool))
154180

155181
def _str_replace(

pandas/tests/strings/test_find_replace.py

+44 -2
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,16 @@ def test_contains_na_kwarg_for_nullable_string_dtype(
166166
# https://github.com/pandas-dev/pandas/pull/41025#issuecomment-824062416
167167

168168
values = Series(["a", "b", "c", "a", np.nan], dtype=nullable_string_dtype)
169-
result = values.str.contains("a", na=na, regex=regex)
169+
170+
msg = (
171+
"Allowing a non-bool 'na' in obj.str.contains is deprecated and "
172+
"will raise in a future version"
173+
)
174+
warn = None
175+
if not pd.isna(na) and not isinstance(na, bool):
176+
warn = FutureWarning
177+
with tm.assert_produces_warning(warn, match=msg):
178+
result = values.str.contains("a", na=na, regex=regex)
170179
expected = Series([True, False, False, True, expected], dtype="boolean")
171180
tm.assert_series_equal(result, expected)
172181

@@ -232,7 +241,12 @@ def test_contains_nan(any_string_dtype):
232241
expected = Series([True, True, True], dtype=expected_dtype)
233242
tm.assert_series_equal(result, expected)
234243

235-
result = s.str.contains("foo", na="foo")
244+
msg = (
245+
"Allowing a non-bool 'na' in obj.str.contains is deprecated and "
246+
"will raise in a future version"
247+
)
248+
with tm.assert_produces_warning(FutureWarning, match=msg):
249+
result = s.str.contains("foo", na="foo")
236250
if any_string_dtype == "object":
237251
expected = Series(["foo", "foo", "foo"], dtype=np.object_)
238252
elif any_string_dtype.na_value is np.nan:
@@ -254,6 +268,34 @@ def test_contains_nan(any_string_dtype):
254268
# --------------------------------------------------------------------------------------
255269

256270

271+
def test_startswith_endswith_validate_na(any_string_dtype):
272+
# GH#59615
273+
ser = Series(
274+
["om", np.nan, "foo_nom", "nom", "bar_foo", np.nan, "foo"],
275+
dtype=any_string_dtype,
276+
)
277+
278+
dtype = ser.dtype
279+
if (
280+
isinstance(dtype, pd.StringDtype) and dtype.storage == "python"
281+
) or dtype == np.dtype("object"):
282+
msg = "Allowing a non-bool 'na' in obj.str.startswith is deprecated"
283+
with tm.assert_produces_warning(FutureWarning, match=msg):
284+
ser.str.startswith("kapow", na="baz")
285+
msg = "Allowing a non-bool 'na' in obj.str.endswith is deprecated"
286+
with tm.assert_produces_warning(FutureWarning, match=msg):
287+
ser.str.endswith("bar", na="baz")
288+
else:
289+
# TODO(infer_string): don't surface pyarrow errors
290+
import pyarrow as pa
291+
292+
msg = "Could not convert 'baz' with type str: tried to convert to boolean"
293+
with pytest.raises(pa.lib.ArrowInvalid, match=msg):
294+
ser.str.startswith("kapow", na="baz")
295+
with pytest.raises(pa.lib.ArrowInvalid, match=msg):
296+
ser.str.endswith("kapow", na="baz")
297+
298+
257299
@pytest.mark.parametrize("pat", ["foo", ("foo", "baz")])
258300
@pytest.mark.parametrize("dtype", ["object", "category"])
259301
@pytest.mark.parametrize("null_value", [None, np.nan, pd.NA])

0 commit comments

Comments
 (0)