diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 0751554d87dc8..3e699e1a27b55 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -107,10 +107,10 @@ Conversion Strings ^^^^^^^ - Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`) +- Bug in :meth:`Series.replace` with :class:`StringDtype` when replacing with a non-string value was not upcasting to ``object`` dtype (:issue:`60282`) - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`) - Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`) - Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`) -- Interval ^^^^^^^^ diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 7e8726f96f90a..e163a9df8ee10 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -726,20 +726,9 @@ def _values_for_factorize(self) -> tuple[np.ndarray, libmissing.NAType | float]: return arr, self.dtype.na_value - def __setitem__(self, key, value) -> None: - value = extract_array(value, extract_numpy=True) - if isinstance(value, type(self)): - # extract_array doesn't extract NumpyExtensionArray subclasses - value = value._ndarray - - key = check_array_indexer(self, key) - scalar_key = lib.is_scalar(key) - scalar_value = lib.is_scalar(value) - if scalar_key and not scalar_value: - raise ValueError("setting an array element with a sequence.") - - # validate new items - if scalar_value: + def _maybe_convert_setitem_value(self, value): + """Maybe convert value to be pyarrow compatible.""" + if lib.is_scalar(value): if isna(value): value = self.dtype.na_value elif not isinstance(value, str): @@ -749,8 +738,11 @@ def __setitem__(self, key, value) -> None: "instead." ) else: + value = extract_array(value, extract_numpy=True) if not is_array_like(value): value = np.asarray(value, dtype=object) + elif isinstance(value.dtype, type(self.dtype)): + return value else: # cast categories and friends to arrays to see if values are # compatible, compatibility with arrow backed strings @@ -760,11 +752,26 @@ def __setitem__(self, key, value) -> None: "Invalid value for dtype 'str'. Value should be a " "string or missing value (or array of those)." ) + return value - mask = isna(value) - if mask.any(): - value = value.copy() - value[isna(value)] = self.dtype.na_value + def __setitem__(self, key, value) -> None: + value = self._maybe_convert_setitem_value(value) + + key = check_array_indexer(self, key) + scalar_key = lib.is_scalar(key) + scalar_value = lib.is_scalar(value) + if scalar_key and not scalar_value: + raise ValueError("setting an array element with a sequence.") + + if not scalar_value: + if value.dtype == self.dtype: + value = value._ndarray + else: + value = np.asarray(value) + mask = isna(value) + if mask.any(): + value = value.copy() + value[isna(value)] = self.dtype.na_value super().__setitem__(key, value) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 7a92b7306beea..1bc944935756e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1754,6 +1754,13 @@ def can_hold_element(arr: ArrayLike, element: Any) -> bool: except (ValueError, TypeError): return False + if dtype == "string": + try: + arr._maybe_convert_setitem_value(element) # type: ignore[union-attr] + return True + except (ValueError, TypeError): + return False + # This is technically incorrect, but maintains the behavior of # ExtensionBlock._can_hold_element return True diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 917a65348b7a3..7ee1361912c05 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -84,6 +84,7 @@ ABCNumpyExtensionArray, ABCSeries, ) +from pandas.core.dtypes.inference import is_re from pandas.core.dtypes.missing import ( is_valid_na_for_dtype, isna, @@ -115,6 +116,7 @@ PeriodArray, TimedeltaArray, ) +from pandas.core.arrays.string_ import StringDtype from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.computation import expressions @@ -476,7 +478,9 @@ def split_and_operate(self, func, *args, **kwargs) -> list[Block]: # Up/Down-casting @final - def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: + def coerce_to_target_dtype( + self, other, warn_on_upcast: bool = False, using_cow: bool = False + ) -> Block: """ coerce the current block to a dtype compat for other we will return a block, possibly object, and not raise @@ -528,7 +532,14 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: f"{self.values.dtype}. Please report a bug at " "https://github.com/pandas-dev/pandas/issues." ) - return self.astype(new_dtype, copy=False) + copy = False + if ( + not using_cow + and isinstance(self.dtype, StringDtype) + and self.dtype.storage == "python" + ): + copy = True + return self.astype(new_dtype, copy=copy, using_cow=using_cow) @final def _maybe_downcast( @@ -879,7 +890,7 @@ def replace( else: return [self] if inplace else [self.copy()] - elif self._can_hold_element(value): + elif self._can_hold_element(value) or (self.dtype == "string" and is_re(value)): # TODO(CoW): Maybe split here as well into columns where mask has True # and rest? blk = self._maybe_copy(using_cow, inplace) @@ -926,12 +937,13 @@ def replace( if value is None or value is NA: blk = self.astype(np.dtype(object)) else: - blk = self.coerce_to_target_dtype(value) + blk = self.coerce_to_target_dtype(value, using_cow=using_cow) return blk.replace( to_replace=to_replace, value=value, inplace=True, mask=mask, + using_cow=using_cow, ) else: @@ -980,16 +992,26 @@ def _replace_regex( ------- List[Block] """ - if not self._can_hold_element(to_replace): + if not is_re(to_replace) and not self._can_hold_element(to_replace): # i.e. only if self.is_object is True, but could in principle include a # String ExtensionBlock if using_cow: return [self.copy(deep=False)] return [self] if inplace else [self.copy()] - rx = re.compile(to_replace) + if is_re(to_replace) and self.dtype not in [object, "string"]: + # only object or string dtype can hold strings, and a regex object + # will only match strings + return [self.copy(deep=False)] - block = self._maybe_copy(using_cow, inplace) + if not ( + self._can_hold_element(value) or (self.dtype == "string" and is_re(value)) + ): + block = self.astype(np.dtype(object)) + else: + block = self._maybe_copy(using_cow, inplace) + + rx = re.compile(to_replace) replace_regex(block.values, rx, value, mask) @@ -1048,7 +1070,9 @@ def replace_list( # Exclude anything that we know we won't contain pairs = [ - (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x) + (x, y) + for x, y in zip(src_list, dest_list) + if (self._can_hold_element(x) or (self.dtype == "string" and is_re(x))) ] if not len(pairs): if using_cow: diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index ccee7ca24bd3d..8df9893e73766 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -297,7 +297,6 @@ def test_regex_replace_dict_nested_non_first_character( expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_dict_nested_gh4115(self): df = DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2}) expected = DataFrame({"Type": [0, 1, 0, 0, 1], "tmp": 2}) @@ -556,7 +555,6 @@ def test_replace_series_dict(self): result = df.replace(s, df.mean()) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_convert(self): # gh 3907 df = DataFrame([["foo", "bar", "bah"], ["bar", "foo", "bah"]]) @@ -932,7 +930,6 @@ def test_replace_input_formats_listlike(self): with pytest.raises(ValueError, match=msg): df.replace(to_rep, values[1:]) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_input_formats_scalar(self): df = DataFrame( {"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]} @@ -961,7 +958,6 @@ def test_replace_limit(self): # TODO pass - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_dict_no_regex(self): answer = Series( { @@ -985,7 +981,6 @@ def test_replace_dict_no_regex(self): result = answer.replace(weights) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_series_no_regex(self): answer = Series( { @@ -1104,7 +1099,6 @@ def test_replace_swapping_bug(self, using_infer_string): expect = DataFrame({"a": ["Y", "N", "Y"]}) tm.assert_frame_equal(res, expect) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_period(self): d = { "fname": { @@ -1141,7 +1135,6 @@ def test_replace_period(self): result = df.replace(d) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_datetime(self): d = { "fname": { @@ -1367,7 +1360,6 @@ def test_replace_commutative(self, df, to_replace, exp): result = df.replace(to_replace) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") @pytest.mark.parametrize( "replacer", [ @@ -1644,7 +1636,6 @@ def test_regex_replace_scalar( expected.loc[expected["a"] == ".", "a"] = expected_replace_val tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") @pytest.mark.parametrize("regex", [False, True]) def test_replace_regex_dtype_frame(self, regex): # GH-48644 diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index d95ee99489076..a1263e2d30853 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -886,24 +886,16 @@ def test_index_where(self, obj, key, expected, warn, val): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True - if obj.dtype == "string" and not (isinstance(val, str) or isna(val)): - with pytest.raises(TypeError, match="Invalid value"): - Index(obj, dtype=obj.dtype).where(~mask, val) - else: - res = Index(obj, dtype=obj.dtype).where(~mask, val) - expected_idx = Index(expected, dtype=expected.dtype) - tm.assert_index_equal(res, expected_idx) + res = Index(obj, dtype=obj.dtype).where(~mask, val) + expected_idx = Index(expected, dtype=expected.dtype) + tm.assert_index_equal(res, expected_idx) def test_index_putmask(self, obj, key, expected, warn, val): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True - if obj.dtype == "string" and not (isinstance(val, str) or isna(val)): - with pytest.raises(TypeError, match="Invalid value"): - Index(obj, dtype=obj.dtype).putmask(mask, val) - else: - res = Index(obj, dtype=obj.dtype).putmask(mask, val) - tm.assert_index_equal(res, Index(expected, dtype=expected.dtype)) + res = Index(obj, dtype=obj.dtype).putmask(mask, val) + tm.assert_index_equal(res, Index(expected, dtype=expected.dtype)) @pytest.mark.parametrize( diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 79a66526a0004..0b0cf57a70c3f 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -403,10 +403,6 @@ def test_replace_categorical(self, categorical, numeric, using_infer_string): ser = pd.Series(categorical) msg = "Downcasting behavior in `replace`" msg = "with CategoricalDtype is deprecated" - if using_infer_string: - with pytest.raises(TypeError, match="Invalid value"): - ser.replace({"A": 1, "B": 2}) - return with tm.assert_produces_warning(FutureWarning, match=msg): result = ser.replace({"A": 1, "B": 2}) expected = pd.Series(numeric).astype("category") @@ -745,13 +741,13 @@ def test_replace_regex_dtype_series(self, regex): tm.assert_series_equal(result, expected) @pytest.mark.parametrize("regex", [False, True]) - def test_replace_regex_dtype_series_string(self, regex, using_infer_string): - if not using_infer_string: - # then this is object dtype which is already tested above - return + def test_replace_regex_dtype_series_string(self, regex): series = pd.Series(["0"], dtype="str") - with pytest.raises(TypeError, match="Invalid value"): - series.replace(to_replace="0", value=1, regex=regex) + expected = pd.Series([1], dtype="int64") + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = series.replace(to_replace="0", value=1, regex=regex) + tm.assert_series_equal(result, expected) def test_replace_different_int_types(self, any_int_numpy_dtype): # GH#45311