From faed77bfacf243883f25c6e52f06d3961c0ff7f5 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 22 Aug 2024 11:43:57 +0200 Subject: [PATCH 01/34] String dtype: fix pyarrow-based IO + update tests (#59478) --- pandas/io/_util.py | 2 + pandas/tests/io/test_feather.py | 27 +++++++++----- pandas/tests/io/test_fsspec.py | 6 +-- pandas/tests/io/test_gcs.py | 2 +- pandas/tests/io/test_orc.py | 25 +++++++------ pandas/tests/io/test_parquet.py | 65 ++++++++++++++++++++++----------- 6 files changed, 80 insertions(+), 47 deletions(-) diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 68fcfcf65e0c2..50a97f1059b5c 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -24,6 +24,8 @@ def _arrow_dtype_mapping() -> dict: pa.string(): pd.StringDtype(), pa.float32(): pd.Float32Dtype(), pa.float64(): pd.Float64Dtype(), + pa.string(): pd.StringDtype(), + pa.large_string(): pd.StringDtype(), } diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index d1201686edefa..57e12747a3746 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -9,12 +9,10 @@ from pandas.io.feather_format import read_feather, to_feather # isort:skip -pytestmark = [ - pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" - ), - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + pa = pytest.importorskip("pyarrow") @@ -154,8 +152,8 @@ def test_path_localpath(self): def test_passthrough_keywords(self): df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ).reset_index() self.check_round_trip(df, write_kwargs={"version": 1}) @@ -169,7 +167,9 @@ def 
test_http_path(self, feather_file, httpserver): res = read_feather(httpserver.url) tm.assert_frame_equal(expected, res) - def test_read_feather_dtype_backend(self, string_storage, dtype_backend): + def test_read_feather_dtype_backend( + self, string_storage, dtype_backend, using_infer_string + ): # GH#50765 df = pd.DataFrame( { @@ -191,7 +191,10 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend): if dtype_backend == "pyarrow": pa = pytest.importorskip("pyarrow") - string_dtype = pd.ArrowDtype(pa.string()) + if using_infer_string: + string_dtype = pd.ArrowDtype(pa.large_string()) + else: + string_dtype = pd.ArrowDtype(pa.string()) else: string_dtype = pd.StringDtype(string_storage) @@ -218,6 +221,10 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend): } ) + if using_infer_string: + expected.columns = expected.columns.astype( + pd.StringDtype(string_storage, na_value=np.nan) + ) tm.assert_frame_equal(result, expected) def test_int_columns_and_index(self): diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 19b60e17d3a92..5ed64e3eb0958 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -168,7 +168,7 @@ def test_excel_options(fsspectest): assert fsspectest.test[0] == "read" -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet") def test_to_parquet_new_file(cleared_fs, df1): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") @@ -198,7 +198,7 @@ def test_arrowparquet_options(fsspectest): @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fastparquet -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet") def test_fastparquet_options(fsspectest): """Regression test for writing to a not-yet-existent 
GCS Parquet file.""" pytest.importorskip("fastparquet") @@ -256,7 +256,7 @@ def test_s3_protocols(s3_public_bucket_with_data, tips_file, protocol, s3so): ) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet") @pytest.mark.single_cpu @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fastparquet def test_s3_parquet(s3_public_bucket, s3so, df1): diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 96bc0326b23ab..81f951b3958b0 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -197,7 +197,7 @@ def test_to_csv_compression_encoding_gcs( tm.assert_frame_equal(df, read_df) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet") def test_to_parquet_gcs_new_file(monkeypatch, tmpdir): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 52d6850483418..d2204a9134f90 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import read_orc import pandas._testing as tm @@ -19,12 +17,9 @@ import pyarrow as pa -pytestmark = [ - pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" - ), - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) @pytest.fixture @@ -47,7 +42,7 @@ def orc_writer_dtypes_not_supported(request): return pd.DataFrame({"unimpl": request.param}) -def test_orc_reader_empty(dirpath): +def test_orc_reader_empty(dirpath, using_infer_string): columns = 
[ "boolean1", "byte1", @@ -68,11 +63,12 @@ def test_orc_reader_empty(dirpath): "float32", "float64", "object", - "object", + "str" if using_infer_string else "object", ] expected = pd.DataFrame(index=pd.RangeIndex(0)) for colname, dtype in zip(columns, dtypes): expected[colname] = pd.Series(dtype=dtype) + expected.columns = expected.columns.astype("str") inputfile = os.path.join(dirpath, "TestOrcFile.emptyFile.orc") got = read_orc(inputfile, columns=columns) @@ -309,7 +305,7 @@ def test_orc_writer_dtypes_not_supported(orc_writer_dtypes_not_supported): orc_writer_dtypes_not_supported.to_orc() -def test_orc_dtype_backend_pyarrow(): +def test_orc_dtype_backend_pyarrow(using_infer_string): pytest.importorskip("pyarrow") df = pd.DataFrame( { @@ -340,6 +336,13 @@ def test_orc_dtype_backend_pyarrow(): for col in df.columns } ) + if using_infer_string: + # ORC does not preserve distinction between string and large string + # -> the default large string comes back as string + string_dtype = pd.ArrowDtype(pa.string()) + expected["string"] = expected["string"].astype(string_dtype) + expected["string_with_nan"] = expected["string_with_nan"].astype(string_dtype) + expected["string_with_none"] = expected["string_with_none"].astype(string_dtype) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 59662ec77d52f..578c0949a6c97 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -55,7 +55,6 @@ pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ), - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), ] @@ -64,11 +63,18 @@ params=[ pytest.param( "fastparquet", - marks=pytest.mark.skipif( - not _HAVE_FASTPARQUET - or _get_option("mode.data_manager", silent=True) == "array", - reason="fastparquet is not installed or ArrayManager is used", - ), + marks=[ + pytest.mark.skipif( + not _HAVE_FASTPARQUET + or 
_get_option("mode.data_manager", silent=True) == "array", + reason="fastparquet is not installed or ArrayManager is used", + ), + pytest.mark.xfail( + using_string_dtype(), + reason="TODO(infer_string) fastparquet", + strict=False, + ), + ], ), pytest.param( "pyarrow", @@ -90,17 +96,24 @@ def pa(): @pytest.fixture -def fp(): +def fp(request): if not _HAVE_FASTPARQUET: pytest.skip("fastparquet is not installed") elif _get_option("mode.data_manager", silent=True) == "array": pytest.skip("ArrayManager is not supported with fastparquet") + if using_string_dtype(): + request.applymarker( + pytest.mark.xfail(reason="TODO(infer_string) fastparquet", strict=False) + ) return "fastparquet" @pytest.fixture def df_compat(): - return pd.DataFrame({"A": [1, 2, 3], "B": "foo"}) + # TODO(infer_string) should this give str columns? + return pd.DataFrame( + {"A": [1, 2, 3], "B": "foo"}, columns=pd.Index(["A", "B"], dtype=object) + ) @pytest.fixture @@ -389,16 +402,6 @@ def check_external_error_on_write(self, df, engine, exc): with tm.external_error_raised(exc): to_parquet(df, path, engine, compression=None) - @pytest.mark.network - @pytest.mark.single_cpu - def test_parquet_read_from_url(self, httpserver, datapath, df_compat, engine): - if engine != "auto": - pytest.importorskip(engine) - with open(datapath("io", "data", "parquet", "simple.parquet"), mode="rb") as f: - httpserver.serve_content(content=f.read()) - df = read_parquet(httpserver.url) - tm.assert_frame_equal(df, df_compat) - class TestBasic(Base): def test_error(self, engine): @@ -696,6 +699,16 @@ def test_read_empty_array(self, pa, dtype): df, pa, read_kwargs={"dtype_backend": "numpy_nullable"}, expected=expected ) + @pytest.mark.network + @pytest.mark.single_cpu + def test_parquet_read_from_url(self, httpserver, datapath, df_compat, engine): + if engine != "auto": + pytest.importorskip(engine) + with open(datapath("io", "data", "parquet", "simple.parquet"), mode="rb") as f: + httpserver.serve_content(content=f.read()) 
+ df = read_parquet(httpserver.url, engine=engine) + tm.assert_frame_equal(df, df_compat) + class TestParquetPyArrow(Base): def test_basic(self, pa, df_full): @@ -925,7 +938,7 @@ def test_write_with_schema(self, pa): out_df = df.astype(bool) check_round_trip(df, pa, write_kwargs={"schema": schema}, expected=out_df) - def test_additional_extension_arrays(self, pa): + def test_additional_extension_arrays(self, pa, using_infer_string): # test additional ExtensionArrays that are supported through the # __arrow_array__ protocol pytest.importorskip("pyarrow") @@ -936,17 +949,25 @@ def test_additional_extension_arrays(self, pa): "c": pd.Series(["a", None, "c"], dtype="string"), } ) - check_round_trip(df, pa) + if using_infer_string: + check_round_trip(df, pa, expected=df.astype({"c": "str"})) + else: + check_round_trip(df, pa) df = pd.DataFrame({"a": pd.Series([1, 2, 3, None], dtype="Int64")}) check_round_trip(df, pa) - def test_pyarrow_backed_string_array(self, pa, string_storage): + def test_pyarrow_backed_string_array(self, pa, string_storage, using_infer_string): # test ArrowStringArray supported through the __arrow_array__ protocol pytest.importorskip("pyarrow") df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="string[pyarrow]")}) with pd.option_context("string_storage", string_storage): - check_round_trip(df, pa, expected=df.astype(f"string[{string_storage}]")) + if using_infer_string: + expected = df.astype("str") + expected.columns = expected.columns.astype("str") + else: + expected = df.astype(f"string[{string_storage}]") + check_round_trip(df, pa, expected=expected) def test_additional_extension_types(self, pa): # test additional ExtensionArrays that are supported through the From ac9bf9c1bf6e50b7d799472b22995ad6d7a0fcae Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 22 Aug 2024 13:55:02 -0700 Subject: [PATCH 02/34] REF (string): avoid copy in StringArray factorize (#59551) * REF: avoid copy in StringArray factorize * mypy fixup * un-xfail --- 
pandas/_libs/arrays.pyx | 4 ++++ pandas/_libs/hashtable.pyx | 5 ++++- pandas/_libs/hashtable_class_helper.pxi.in | 18 +++++++++++++++--- pandas/core/arrays/_mixins.py | 19 ++++++++----------- pandas/core/arrays/categorical.py | 5 ----- pandas/core/arrays/numpy_.py | 3 --- pandas/core/arrays/string_.py | 12 +++--------- pandas/tests/groupby/test_groupby_dropna.py | 3 --- pandas/tests/window/test_rolling.py | 6 ------ 9 files changed, 34 insertions(+), 41 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 9889436a542c1..2932f3ff56396 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -67,6 +67,10 @@ cdef class NDArrayBacked: """ Construct a new ExtensionArray `new_array` with `arr` as its _ndarray. + The returned array has the same dtype as self. + + Caller is responsible for ensuring `values.dtype == self._ndarray.dtype`. + This should round-trip: self == self._from_backing_data(self._ndarray) """ diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index ccac3d0b50d45..127b0b845d219 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -33,7 +33,10 @@ from pandas._libs.khash cimport ( kh_python_hash_func, khiter_t, ) -from pandas._libs.missing cimport checknull +from pandas._libs.missing cimport ( + checknull, + is_matching_na, +) def get_hashtable_trace_domain(): diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index c0723392496c1..c42bccb7f38f7 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -1121,11 +1121,13 @@ cdef class StringHashTable(HashTable): const char **vecs khiter_t k bint use_na_value + bint non_null_na_value if return_inverse: labels = np.zeros(n, dtype=np.intp) uindexer = np.empty(n, dtype=np.int64) use_na_value = na_value is not None + non_null_na_value = not checknull(na_value) # assign pointers and pre-filter out missing (if ignore_na) vecs = 
malloc(n * sizeof(char *)) @@ -1134,7 +1136,12 @@ cdef class StringHashTable(HashTable): if (ignore_na and (not isinstance(val, str) - or (use_na_value and val == na_value))): + or (use_na_value and ( + (non_null_na_value and val == na_value) or + (not non_null_na_value and is_matching_na(val, na_value))) + ) + ) + ): # if missing values do not count as unique values (i.e. if # ignore_na is True), we can skip the actual value, and # replace the label with na_sentinel directly @@ -1400,10 +1407,11 @@ cdef class PyObjectHashTable(HashTable): object val khiter_t k bint use_na_value - + bint non_null_na_value if return_inverse: labels = np.empty(n, dtype=np.intp) use_na_value = na_value is not None + non_null_na_value = not checknull(na_value) for i in range(n): val = values[i] @@ -1411,7 +1419,11 @@ cdef class PyObjectHashTable(HashTable): if ignore_na and ( checknull(val) - or (use_na_value and val == na_value) + or (use_na_value and ( + (non_null_na_value and val == na_value) or + (not non_null_na_value and is_matching_na(val, na_value)) + ) + ) ): # if missing values do not count as unique values (i.e. if # ignore_na is True), skip the hashtable entry for them, and diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 0da121c36644a..cb6861a8dd00f 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -515,17 +515,14 @@ def _quantile( fill_value = self._internal_fill_value res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation) - - res_values = self._cast_quantile_result(res_values) - return self._from_backing_data(res_values) - - # TODO: see if we can share this with other dispatch-wrapping methods - def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray: - """ - Cast the result of quantile_with_mask to an appropriate dtype - to pass to _from_backing_data in _quantile. 
- """ - return res_values + if res_values.dtype == self._ndarray.dtype: + return self._from_backing_data(res_values) + else: + # e.g. test_quantile_empty we are empty integer dtype and res_values + # has floating dtype + # TODO: technically __init__ isn't defined here. + # Should we raise NotImplementedError and handle this on NumpyEA? + return type(self)(res_values) # type: ignore[call-arg] # ------------------------------------------------------------------------ # numpy-like methods diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index f191f7277743f..6ffc0df243130 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2475,11 +2475,6 @@ def unique(self) -> Self: # pylint: disable=useless-parent-delegation return super().unique() - def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray: - # make sure we have correct itemsize for resulting codes - assert res_values.dtype == self._ndarray.dtype - return res_values - def equals(self, other: object) -> bool: """ Returns True if categorical arrays are equal. 
diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 03712f75db0c7..aafcd82114b97 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -137,9 +137,6 @@ def _from_sequence( result = result.copy() return cls(result) - def _from_backing_data(self, arr: np.ndarray) -> NumpyExtensionArray: - return type(self)(arr) - # ------------------------------------------------------------------------ # Data diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 1aa6fb70d250c..fa1e5e605e16e 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -657,11 +657,10 @@ def __arrow_array__(self, type=None): values[self.isna()] = None return pa.array(values, type=type, from_pandas=True) - def _values_for_factorize(self): + def _values_for_factorize(self) -> tuple[np.ndarray, libmissing.NAType | float]: # type: ignore[override] arr = self._ndarray.copy() - mask = self.isna() - arr[mask] = None - return arr, None + + return arr, self.dtype.na_value def __setitem__(self, key, value) -> None: value = extract_array(value, extract_numpy=True) @@ -871,8 +870,3 @@ def _from_sequence( if dtype is None: dtype = StringDtype(storage="python", na_value=np.nan) return super()._from_sequence(scalars, dtype=dtype, copy=copy) - - def _from_backing_data(self, arr: np.ndarray) -> StringArrayNumpySemantics: - # need to override NumpyExtensionArray._from_backing_data to ensure - # we always preserve the dtype - return NDArrayBacked._from_backing_data(self, arr) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index d843a992daee0..3856a0d8928a7 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -388,9 +388,6 @@ def test_groupby_dropna_with_multiindex_input(input_index, keys, series): tm.assert_equal(result, expected) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, 
reason="TODO(infer_string)" -) def test_groupby_nan_included(): # GH 35646 data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]} diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index acf636616421f..f353a7fa2f0fe 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -6,10 +6,7 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import ( - HAS_PYARROW, IS64, is_platform_arm, is_platform_power, @@ -1423,9 +1420,6 @@ def test_rolling_corr_timedelta_index(index, window): tm.assert_almost_equal(result, expected) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" -) def test_groupby_rolling_nan_included(): # GH 35542 data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]} From 8386795de412f581c228a5143eb2e06da2704b38 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 27 Aug 2024 17:51:42 +0200 Subject: [PATCH 03/34] String dtype: avoid surfacing pyarrow exception in binary operations (#59610) --- pandas/core/arrays/arrow/array.py | 40 ++++++++++++++--- pandas/core/arrays/string_.py | 5 ++- pandas/tests/arithmetic/test_object.py | 25 +++-------- .../tests/arrays/boolean/test_arithmetic.py | 26 +++-------- .../tests/arrays/floating/test_arithmetic.py | 23 ++++------ .../tests/arrays/integer/test_arithmetic.py | 34 +++++--------- pandas/tests/extension/base/ops.py | 10 +---- .../tests/extension/decimal/test_decimal.py | 2 +- pandas/tests/extension/test_arrow.py | 38 +++------------- pandas/tests/extension/test_string.py | 32 ++++--------- pandas/tests/frame/test_logical_ops.py | 20 +++------ pandas/tests/frame/test_unary.py | 26 +++-------- pandas/tests/indexes/object/test_indexing.py | 45 ++++++------------- pandas/tests/indexes/test_old_base.py | 14 ++---- pandas/tests/series/test_arithmetic.py | 26 +++-------- pandas/tests/series/test_logical_ops.py 
| 36 +++++---------- 16 files changed, 129 insertions(+), 273 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 46f2cbb2ebeef..5f8963c81b0ba 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -676,7 +676,12 @@ def __invert__(self) -> Self: return type(self)(pc.invert(self._pa_array)) def __neg__(self) -> Self: - return type(self)(pc.negate_checked(self._pa_array)) + try: + return type(self)(pc.negate_checked(self._pa_array)) + except pa.ArrowNotImplementedError as err: + raise TypeError( + f"unary '-' not supported for dtype '{self.dtype}'" + ) from err def __pos__(self) -> Self: return type(self)(self._pa_array) @@ -731,8 +736,19 @@ def _cmp_method(self, other, op): ) return ArrowExtensionArray(result) - def _evaluate_op_method(self, other, op, arrow_funcs): + def _op_method_error_message(self, other, op) -> str: + if hasattr(other, "dtype"): + other_type = f"dtype '{other.dtype}'" + else: + other_type = f"object of type {type(other)}" + return ( + f"operation '{op.__name__}' not supported for " + f"dtype '{self.dtype}' with {other_type}" + ) + + def _evaluate_op_method(self, other, op, arrow_funcs) -> Self: pa_type = self._pa_array.type + other_original = other other = self._box_pa(other) if ( @@ -742,10 +758,15 @@ def _evaluate_op_method(self, other, op, arrow_funcs): ): if op in [operator.add, roperator.radd]: sep = pa.scalar("", type=pa_type) - if op is operator.add: - result = pc.binary_join_element_wise(self._pa_array, other, sep) - elif op is roperator.radd: - result = pc.binary_join_element_wise(other, self._pa_array, sep) + try: + if op is operator.add: + result = pc.binary_join_element_wise(self._pa_array, other, sep) + elif op is roperator.radd: + result = pc.binary_join_element_wise(other, self._pa_array, sep) + except pa.ArrowNotImplementedError as err: + raise TypeError( + self._op_method_error_message(other_original, op) + ) from err return type(self)(result) 
elif op in [operator.mul, roperator.rmul]: binary = self._pa_array @@ -777,9 +798,14 @@ def _evaluate_op_method(self, other, op, arrow_funcs): pc_func = arrow_funcs[op.__name__] if pc_func is NotImplemented: + if pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type): + raise TypeError(self._op_method_error_message(other_original, op)) raise NotImplementedError(f"{op.__name__} not implemented.") - result = pc_func(self._pa_array, other) + try: + result = pc_func(self._pa_array, other) + except pa.ArrowNotImplementedError as err: + raise TypeError(self._op_method_error_message(other_original, op)) from err return type(self)(result) def _logical_method(self, other, op): diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index fa1e5e605e16e..c04ec13dbd81c 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -823,8 +823,11 @@ def _cmp_method(self, other, op): f"Lengths of operands do not match: {len(self)} != {len(other)}" ) - other = np.asarray(other) + # for array-likes, first filter out NAs before converting to numpy + if not is_array_like(other): + other = np.asarray(other) other = other[valid] + other = np.asarray(other) if op.__name__ in ops.ARITHMETIC_BINOPS: result = np.empty_like(self._ndarray, dtype="object") diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index 899ea1910d055..bc0f78d3aa01a 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -8,9 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td import pandas as pd @@ -318,27 +315,17 @@ def test_add(self): expected = pd.Index(["1a", "1b", "1c"]) tm.assert_index_equal("1" + index, expected) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" - ) - def test_sub_fail(self, using_infer_string): + def 
test_sub_fail(self): index = pd.Index([str(i) for i in range(10)]) - if using_infer_string: - import pyarrow as pa - - err = pa.lib.ArrowNotImplementedError - msg = "has no kernel" - else: - err = TypeError - msg = "unsupported operand type|Cannot broadcast" - with pytest.raises(err, match=msg): + msg = "unsupported operand type|Cannot broadcast|sub' not supported" + with pytest.raises(TypeError, match=msg): index - "a" - with pytest.raises(err, match=msg): + with pytest.raises(TypeError, match=msg): index - index - with pytest.raises(err, match=msg): + with pytest.raises(TypeError, match=msg): index - index.tolist() - with pytest.raises(err, match=msg): + with pytest.raises(TypeError, match=msg): index.tolist() - index def test_sub_object(self): diff --git a/pandas/tests/arrays/boolean/test_arithmetic.py b/pandas/tests/arrays/boolean/test_arithmetic.py index 4dbd8eb9f5ca7..9ff690cdc914d 100644 --- a/pandas/tests/arrays/boolean/test_arithmetic.py +++ b/pandas/tests/arrays/boolean/test_arithmetic.py @@ -3,10 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW - import pandas as pd import pandas._testing as tm @@ -94,19 +90,8 @@ def test_op_int8(left_array, right_array, opname): # ----------------------------------------------------------------------------- -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" -) -def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string): +def test_error_invalid_values(data, all_arithmetic_operators): # invalid ops - - if using_infer_string: - import pyarrow as pa - - err = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError) - else: - err = TypeError - op = all_arithmetic_operators s = pd.Series(data) ops = getattr(s, op) @@ -116,7 +101,8 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string "did not contain a loop with signature matching types|" 
"BooleanArray cannot perform the operation|" "not supported for the input types, and the inputs could not be safely coerced " - "to any supported types according to the casting rule ''safe''" + "to any supported types according to the casting rule ''safe''|" + "not supported for dtype" ) with pytest.raises(TypeError, match=msg): ops("foo") @@ -125,9 +111,10 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string r"unsupported operand type\(s\) for", "Concatenation operation is not implemented for NumPy arrays", "has no kernel", + "not supported for dtype", ] ) - with pytest.raises(err, match=msg): + with pytest.raises(TypeError, match=msg): ops(pd.Timestamp("20180101")) # invalid array-likes @@ -140,7 +127,8 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string "not all arguments converted during string formatting", "has no kernel", "not implemented", + "not supported for dtype", ] ) - with pytest.raises(err, match=msg): + with pytest.raises(TypeError, match=msg): ops(pd.Series("foo", index=s.index)) diff --git a/pandas/tests/arrays/floating/test_arithmetic.py b/pandas/tests/arrays/floating/test_arithmetic.py index 768d3c1449fa4..009fac4c2f5ed 100644 --- a/pandas/tests/arrays/floating/test_arithmetic.py +++ b/pandas/tests/arrays/floating/test_arithmetic.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd import pandas._testing as tm from pandas.core.arrays import FloatingArray @@ -124,19 +122,11 @@ def test_arith_zero_dim_ndarray(other): # ----------------------------------------------------------------------------- -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) -def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string): +def test_error_invalid_values(data, all_arithmetic_operators): op = all_arithmetic_operators s = pd.Series(data) ops = getattr(s, op) - if using_infer_string: - 
import pyarrow as pa - - errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError) - else: - errs = TypeError - # invalid scalars msg = "|".join( [ @@ -152,15 +142,17 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string "Concatenation operation is not implemented for NumPy arrays", "has no kernel", "not implemented", + "not supported for dtype", + "Can only string multiply by an integer", ] ) - with pytest.raises(errs, match=msg): + with pytest.raises(TypeError, match=msg): ops("foo") - with pytest.raises(errs, match=msg): + with pytest.raises(TypeError, match=msg): ops(pd.Timestamp("20180101")) # invalid array-likes - with pytest.raises(errs, match=msg): + with pytest.raises(TypeError, match=msg): ops(pd.Series("foo", index=s.index)) msg = "|".join( @@ -181,9 +173,10 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string "cannot subtract DatetimeArray from ndarray", "has no kernel", "not implemented", + "not supported for dtype", ] ) - with pytest.raises(errs, match=msg): + with pytest.raises(TypeError, match=msg): ops(pd.Series(pd.date_range("20180101", periods=len(s)))) diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py index 8aa8c2db940b4..dee3deeee0f2f 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd import pandas._testing as tm from pandas.core import ops @@ -174,19 +172,11 @@ def test_numpy_zero_dim_ndarray(other): # ----------------------------------------------------------------------------- -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) -def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string): +def test_error_invalid_values(data, all_arithmetic_operators): op = 
all_arithmetic_operators s = pd.Series(data) ops = getattr(s, op) - if using_infer_string: - import pyarrow as pa - - errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError) - else: - errs = TypeError - # invalid scalars msg = "|".join( [ @@ -201,24 +191,21 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string "has no kernel", "not implemented", "The 'out' kwarg is necessary. Use numpy.strings.multiply without it.", + "not supported for dtype", ] ) - with pytest.raises(errs, match=msg): + with pytest.raises(TypeError, match=msg): ops("foo") - with pytest.raises(errs, match=msg): + with pytest.raises(TypeError, match=msg): ops(pd.Timestamp("20180101")) # invalid array-likes str_ser = pd.Series("foo", index=s.index) # with pytest.raises(TypeError, match=msg): - if ( - all_arithmetic_operators - in [ - "__mul__", - "__rmul__", - ] - and not using_infer_string - ): # (data[~data.isna()] >= 0).all(): + if all_arithmetic_operators in [ + "__mul__", + "__rmul__", + ]: # (data[~data.isna()] >= 0).all(): res = ops(str_ser) expected = pd.Series(["foo" * x for x in data], index=s.index) expected = expected.fillna(np.nan) @@ -227,7 +214,7 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string # more-correct than np.nan here. 
tm.assert_series_equal(res, expected) else: - with pytest.raises(errs, match=msg): + with pytest.raises(TypeError, match=msg): ops(str_ser) msg = "|".join( @@ -242,9 +229,10 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string "cannot subtract DatetimeArray from ndarray", "has no kernel", "not implemented", + "not supported for dtype", ] ) - with pytest.raises(errs, match=msg): + with pytest.raises(TypeError, match=msg): ops(pd.Series(pd.date_range("20180101", periods=len(s)))) diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index ff9f3cbed64a2..547114ecfddd0 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -24,7 +24,7 @@ class BaseOpsUtil: def _get_expected_exception( self, op_name: str, obj, other - ) -> type[Exception] | None: + ) -> type[Exception] | tuple[type[Exception], ...] | None: # Find the Exception, if any we expect to raise calling # obj.__op_name__(other) @@ -39,14 +39,6 @@ def _get_expected_exception( else: result = self.frame_scalar_exc - if using_string_dtype() and result is not None: - import pyarrow as pa - - result = ( # type: ignore[assignment] - result, - pa.lib.ArrowNotImplementedError, - NotImplementedError, - ) return result def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 9907e345ada63..8afb989508e04 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -68,7 +68,7 @@ def data_for_grouping(): class TestDecimalArray(base.ExtensionTests): def _get_expected_exception( self, op_name: str, obj, other - ) -> type[Exception] | None: + ) -> type[Exception] | tuple[type[Exception], ...] 
| None: return None def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index d9a3033b8380e..0e8e1809d08ac 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -800,8 +800,6 @@ def test_value_counts_returns_pyarrow_int64(self, data): _combine_le_expected_dtype = "bool[pyarrow]" - divmod_exc = NotImplementedError - def get_op_from_name(self, op_name): short_opname = op_name.strip("_") if short_opname == "rtruediv": @@ -935,10 +933,11 @@ def _is_temporal_supported(self, opname, pa_dtype): def _get_expected_exception( self, op_name: str, obj, other - ) -> type[Exception] | None: + ) -> type[Exception] | tuple[type[Exception], ...] | None: if op_name in ("__divmod__", "__rdivmod__"): - return self.divmod_exc + return (NotImplementedError, TypeError) + exc: type[Exception] | tuple[type[Exception], ...] | None dtype = tm.get_dtype(obj) # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has no # attribute "pyarrow_dtype" @@ -949,7 +948,7 @@ def _get_expected_exception( "__mod__", "__rmod__", }: - exc = NotImplementedError + exc = (NotImplementedError, TypeError) elif arrow_temporal_supported: exc = None elif op_name in ["__add__", "__radd__"] and ( @@ -961,10 +960,7 @@ def _get_expected_exception( or pa.types.is_integer(pa_dtype) or pa.types.is_decimal(pa_dtype) ): - # TODO: in many of these cases, e.g. non-duration temporal, - # these will *never* be allowed. Would it make more sense to - # re-raise as TypeError, more consistent with non-pyarrow cases? 
- exc = pa.ArrowNotImplementedError + exc = TypeError else: exc = None return exc @@ -1020,14 +1016,6 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request) if all_arithmetic_operators == "__rmod__" and pa.types.is_binary(pa_dtype): pytest.skip("Skip testing Python string formatting") - elif all_arithmetic_operators in ("__rmul__", "__mul__") and ( - pa.types.is_binary(pa_dtype) or pa.types.is_string(pa_dtype) - ): - request.applymarker( - pytest.mark.xfail( - raises=TypeError, reason="Can only string multiply by an integer." - ) - ) mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype) if mark is not None: @@ -1042,14 +1030,6 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype) ): pytest.skip("Skip testing Python string formatting") - elif all_arithmetic_operators in ("__rmul__", "__mul__") and ( - pa.types.is_binary(pa_dtype) or pa.types.is_string(pa_dtype) - ): - request.applymarker( - pytest.mark.xfail( - raises=TypeError, reason="Can only string multiply by an integer." - ) - ) mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype) if mark is not None: @@ -1073,14 +1053,6 @@ def test_arith_series_with_array(self, data, all_arithmetic_operators, request): ), ) ) - elif all_arithmetic_operators in ("__rmul__", "__mul__") and ( - pa.types.is_binary(pa_dtype) or pa.types.is_string(pa_dtype) - ): - request.applymarker( - pytest.mark.xfail( - raises=TypeError, reason="Can only string multiply by an integer." 
- ) - ) mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype) if mark is not None: diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index f800f734ec9d9..e44881a6d78ff 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -168,24 +168,15 @@ def test_fillna_no_op_returns_copy(self, data): def _get_expected_exception( self, op_name: str, obj, other - ) -> type[Exception] | None: - if op_name in ["__divmod__", "__rdivmod__"]: - if ( - isinstance(obj, pd.Series) - and cast(StringDtype, tm.get_dtype(obj)).storage == "pyarrow" - ): - # TODO: re-raise as TypeError? - return NotImplementedError - elif ( - isinstance(other, pd.Series) - and cast(StringDtype, tm.get_dtype(other)).storage == "pyarrow" - ): - # TODO: re-raise as TypeError? - return NotImplementedError - return TypeError - elif op_name in ["__mod__", "__rmod__", "__pow__", "__rpow__"]: - if cast(StringDtype, tm.get_dtype(obj)).storage == "pyarrow": - return NotImplementedError + ) -> type[Exception] | tuple[type[Exception], ...] | None: + if op_name in [ + "__mod__", + "__rmod__", + "__divmod__", + "__rdivmod__", + "__pow__", + "__rpow__", + ]: return TypeError elif op_name in ["__mul__", "__rmul__"]: # Can only multiply strings by integers @@ -198,11 +189,6 @@ def _get_expected_exception( "__sub__", "__rsub__", ]: - if cast(StringDtype, tm.get_dtype(obj)).storage == "pyarrow": - import pyarrow as pa - - # TODO: better to re-raise as TypeError? 
- return pa.ArrowNotImplementedError return TypeError return None diff --git a/pandas/tests/frame/test_logical_ops.py b/pandas/tests/frame/test_logical_ops.py index 2684704f86b82..f1163e994557f 100644 --- a/pandas/tests/frame/test_logical_ops.py +++ b/pandas/tests/frame/test_logical_ops.py @@ -4,10 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW - from pandas import ( CategoricalIndex, DataFrame, @@ -100,9 +96,6 @@ def test_logical_ops_int_frame(self): res_ser = df1a_int["A"] | df1a_bool["A"] tm.assert_series_equal(res_ser, df1a_bool["A"]) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" - ) def test_logical_ops_invalid(self, using_infer_string): # GH#5808 @@ -114,15 +107,12 @@ def test_logical_ops_invalid(self, using_infer_string): df1 = DataFrame("foo", index=[1], columns=["A"]) df2 = DataFrame(True, index=[1], columns=["A"]) - msg = re.escape("unsupported operand type(s) for |: 'str' and 'bool'") - if using_infer_string: - import pyarrow as pa - - with pytest.raises(pa.lib.ArrowNotImplementedError, match="|has no kernel"): - df1 | df2 + if using_infer_string and df1["A"].dtype.storage == "pyarrow": + msg = "operation 'or_' not supported for dtype 'str'" else: - with pytest.raises(TypeError, match=msg): - df1 | df2 + msg = re.escape("unsupported operand type(s) for |: 'str' and 'bool'") + with pytest.raises(TypeError, match=msg): + df1 | df2 def test_logical_operators(self): def _check_bin_op(op): diff --git a/pandas/tests/frame/test_unary.py b/pandas/tests/frame/test_unary.py index 8e1df679ee1b4..a76d33e922486 100644 --- a/pandas/tests/frame/test_unary.py +++ b/pandas/tests/frame/test_unary.py @@ -44,11 +44,6 @@ def test_neg_object(self, df, expected): tm.assert_frame_equal(-df, expected) tm.assert_series_equal(-df["a"], expected["a"]) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, - reason="TODO(infer_string)", - 
strict=False, - ) @pytest.mark.parametrize( "df", [ @@ -59,22 +54,13 @@ def test_neg_object(self, df, expected): def test_neg_raises(self, df, using_infer_string): msg = ( "bad operand type for unary -: 'str'|" - r"bad operand type for unary -: 'DatetimeArray'" + r"bad operand type for unary -: 'DatetimeArray'|" + "unary '-' not supported for dtype" ) - if using_infer_string and df.dtypes.iloc[0] == "string": - import pyarrow as pa - - msg = "has no kernel" - with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): - (-df) - with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): - (-df["a"]) - - else: - with pytest.raises(TypeError, match=msg): - (-df) - with pytest.raises(TypeError, match=msg): - (-df["a"]) + with pytest.raises(TypeError, match=msg): + (-df) + with pytest.raises(TypeError, match=msg): + (-df["a"]) def test_invert(self, float_frame): df = float_frame diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index add2f3f18b348..322e6677fe05d 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -3,13 +3,10 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.missing import ( NA, is_matching_na, ) -from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td import pandas as pd @@ -31,39 +28,25 @@ def test_get_indexer_strings(self, method, expected): tm.assert_numpy_array_equal(actual, expected) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" - ) def test_get_indexer_strings_raises(self, using_infer_string): index = Index(["b", "c"]) - if using_infer_string: - import pyarrow as pa - - msg = "has no kernel" - with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): - index.get_indexer(["a", "b", "c", "d"], method="nearest") - - with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): - 
index.get_indexer(["a", "b", "c", "d"], method="pad", tolerance=2) - - with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): - index.get_indexer( - ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] - ) - - else: - msg = r"unsupported operand type\(s\) for -: 'str' and 'str'" - with pytest.raises(TypeError, match=msg): - index.get_indexer(["a", "b", "c", "d"], method="nearest") + msg = "|".join( + [ + "operation 'sub' not supported for dtype 'str'", + r"unsupported operand type\(s\) for -: 'str' and 'str'", + ] + ) + with pytest.raises(TypeError, match=msg): + index.get_indexer(["a", "b", "c", "d"], method="nearest") - with pytest.raises(TypeError, match=msg): - index.get_indexer(["a", "b", "c", "d"], method="pad", tolerance=2) + with pytest.raises(TypeError, match=msg): + index.get_indexer(["a", "b", "c", "d"], method="pad", tolerance=2) - with pytest.raises(TypeError, match=msg): - index.get_indexer( - ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] - ) + with pytest.raises(TypeError, match=msg): + index.get_indexer( + ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] + ) def test_get_indexer_with_NA_values( self, unique_nulls_fixture, unique_nulls_fixture2 diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 8d859a61a2bd5..c17d4f54c36c5 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -853,7 +853,6 @@ def test_append_preserves_dtype(self, simple_index): alt = index.take(list(range(N)) * 2) tm.assert_index_equal(result, alt, check_exact=True) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_inv(self, simple_index, using_infer_string): idx = simple_index @@ -867,21 +866,14 @@ def test_inv(self, simple_index, using_infer_string): tm.assert_series_equal(res2, Series(expected)) else: if idx.dtype.kind == "f": - err = TypeError msg = "ufunc 'invert' not supported for the input types" - elif 
using_infer_string and idx.dtype == "string": - import pyarrow as pa - - err = pa.lib.ArrowNotImplementedError - msg = "has no kernel" else: - err = TypeError - msg = "bad operand" - with pytest.raises(err, match=msg): + msg = "bad operand|__invert__ is not supported for string dtype" + with pytest.raises(TypeError, match=msg): ~idx # check that we get the same behavior with Series - with pytest.raises(err, match=msg): + with pytest.raises(TypeError, match=msg): ~Series(idx) def test_is_boolean_is_deprecated(self, simple_index): diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 1ffc9ddca5adf..a65d7687cfb06 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -9,8 +9,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs import lib from pandas._libs.tslibs import IncompatibleFrequency @@ -214,9 +212,9 @@ def test_series_integer_mod(self, index): s1 = Series(range(1, 10)) s2 = Series("foo", index=index) - msg = "not all arguments converted during string formatting|mod not" + msg = "not all arguments converted during string formatting|'mod' not supported" - with pytest.raises((TypeError, NotImplementedError), match=msg): + with pytest.raises(TypeError, match=msg): s2 % s1 def test_add_with_duplicate_index(self): @@ -501,28 +499,14 @@ def test_ser_cmp_result_names(self, names, comparison_op): result = op(ser, cidx) assert result.name == names[2] - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_comparisons(self, using_infer_string): + def test_comparisons(self): s = Series(["a", "b", "c"]) s2 = Series([False, True, False]) # it works! 
exp = Series([False, False, False]) - if using_infer_string: - import pyarrow as pa - - msg = "has no kernel" - # TODO(3.0) GH56008 - with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): - s == s2 - with tm.assert_produces_warning( - DeprecationWarning, match="comparison", check_stacklevel=False - ): - with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): - s2 == s - else: - tm.assert_series_equal(s == s2, exp) - tm.assert_series_equal(s2 == s, exp) + tm.assert_series_equal(s == s2, exp) + tm.assert_series_equal(s2 == s, exp) # ----------------------------------------------------------------- # Categorical Dtype Comparisons diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index a9f1726afc942..b9ddfc189edce 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -6,8 +6,6 @@ from pandas._config import using_string_dtype -from pandas.compat import HAS_PYARROW - from pandas import ( ArrowDtype, DataFrame, @@ -151,10 +149,7 @@ def test_logical_operators_int_dtype_with_bool(self): expected = Series([False, True, True, True]) tm.assert_series_equal(result, expected) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" - ) - def test_logical_operators_int_dtype_with_object(self, using_infer_string): + def test_logical_operators_int_dtype_with_object(self): # GH#9016: support bitwise op for integer types s_0123 = Series(range(4), dtype="int64") @@ -163,14 +158,10 @@ def test_logical_operators_int_dtype_with_object(self, using_infer_string): tm.assert_series_equal(result, expected) s_abNd = Series(["a", "b", np.nan, "d"]) - if using_infer_string: - import pyarrow as pa - - with pytest.raises(pa.lib.ArrowNotImplementedError, match="has no kernel"): - s_0123 & s_abNd - else: - with pytest.raises(TypeError, match="unsupported.* 'int' and 'str'"): - s_0123 & s_abNd + with pytest.raises( + TypeError, match="unsupported.* 'int' 
and 'str'|'rand_' not supported" + ): + s_0123 & s_abNd def test_logical_operators_bool_dtype_with_int(self): index = list("bca") @@ -368,9 +359,7 @@ def test_reverse_ops_with_index(self, op, expected): result = op(ser, idx) tm.assert_series_equal(result, expected) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" - ) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_logical_ops_label_based(self, using_infer_string): # GH#4947 # logical ops should be label based @@ -439,15 +428,12 @@ def test_logical_ops_label_based(self, using_infer_string): tm.assert_series_equal(result, a[a]) for e in [Series(["z"])]: - warn = FutureWarning if using_infer_string else None if using_infer_string: - import pyarrow as pa - - with tm.assert_produces_warning(warn, match="Operation between non"): - with pytest.raises( - pa.lib.ArrowNotImplementedError, match="has no kernel" - ): - result = a[a | e] + # TODO(infer_string) should this behave differently? + with pytest.raises( + TypeError, match="not supported for dtype|unsupported operand type" + ): + result = a[a | e] else: result = a[a | e] tm.assert_series_equal(result, a[a]) From 5783aa4d0de6e82be97b1e6a7b9f33bedc6e78dd Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 27 Aug 2024 19:51:35 +0200 Subject: [PATCH 04/34] DOC: Add whatsnew for 2.3.0 (#59625) * DOC: Add whatsnew for 2.3.0 * fix duplicate label --- doc/source/whatsnew/index.rst | 8 ++ doc/source/whatsnew/v2.3.0.rst | 177 +++++++++++++++++++++++++++++++++ 2 files changed, 185 insertions(+) create mode 100644 doc/source/whatsnew/v2.3.0.rst diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 09d76d71c6e1b..ae96d0f8296f2 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -10,6 +10,14 @@ This is the list of changes to pandas between each release. For full details, see the `commit logs `_. 
For install and upgrade instructions, see :ref:`install`. +Version 2.3 +----------- + +.. toctree:: + :maxdepth: 2 + + v2.3.0 + Version 2.2 ----------- diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst new file mode 100644 index 0000000000000..d1881bf04826f --- /dev/null +++ b/doc/source/whatsnew/v2.3.0.rst @@ -0,0 +1,177 @@ +.. _whatsnew_230: + +What's new in 2.3.0 (Month XX, 2024) +------------------------------------ + +These are the changes in pandas 2.3.0. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- + +.. _whatsnew_230.upcoming_changes: + +Upcoming changes in pandas 3.0 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +.. _whatsnew_230.enhancements: + +Enhancements +~~~~~~~~~~~~ + +.. _whatsnew_230.enhancements.enhancement1: + +enhancement1 +^^^^^^^^^^^^ + + +.. _whatsnew_230.enhancements.other: + +Other enhancements +^^^^^^^^^^^^^^^^^^ + +- +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_230.notable_bug_fixes: + +Notable bug fixes +~~~~~~~~~~~~~~~~~ + +These are bug fixes that might have notable behavior changes. + +.. _whatsnew_230.notable_bug_fixes.notable_bug_fix1: + +notable_bug_fix1 +^^^^^^^^^^^^^^^^ + +.. --------------------------------------------------------------------------- +.. _whatsnew_230.deprecations: + +Deprecations +~~~~~~~~~~~~ +- +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_230.performance: + +Performance improvements +~~~~~~~~~~~~~~~~~~~~~~~~ +- +- + +.. --------------------------------------------------------------------------- +.. 
_whatsnew_230.bug_fixes: + +Bug fixes +~~~~~~~~~ + +Categorical +^^^^^^^^^^^ +- +- + +Datetimelike +^^^^^^^^^^^^ +- +- + +Timedelta +^^^^^^^^^ +- +- + +Timezones +^^^^^^^^^ +- +- + +Numeric +^^^^^^^ +- +- + +Conversion +^^^^^^^^^^ +- +- + +Strings +^^^^^^^ +- +- + +Interval +^^^^^^^^ +- +- + +Indexing +^^^^^^^^ +- +- + +Missing +^^^^^^^ +- +- + +MultiIndex +^^^^^^^^^^ +- +- + +I/O +^^^ +- +- + +Period +^^^^^^ +- +- + +Plotting +^^^^^^^^ +- +- + +Groupby/resample/rolling +^^^^^^^^^^^^^^^^^^^^^^^^ +- +- + +Reshaping +^^^^^^^^^ +- +- + +Sparse +^^^^^^ +- +- + +ExtensionArray +^^^^^^^^^^^^^^ +- +- + +Styler +^^^^^^ +- +- + +Other +^^^^^ +- +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_230.contributors: + +Contributors +~~~~~~~~~~~~ From 1833ccbcd713303025e1332d200917ad960e3867 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 27 Aug 2024 15:54:15 -0700 Subject: [PATCH 05/34] BUG (string): str.replace with negative n (#59628) * BUG (string): str.replace with negative n * update GH ref --- doc/source/whatsnew/v2.3.0.rst | 2 +- pandas/core/arrays/string_arrow.py | 4 +--- pandas/tests/extension/test_arrow.py | 11 +++++++++++ 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index d1881bf04826f..528226502da33 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -102,7 +102,7 @@ Conversion Strings ^^^^^^^ -- +- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`) - Interval diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 91c1f20ba93c6..5c6cca41be027 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -352,9 +352,7 @@ def _str_replace( fallback_performancewarning() return super()._str_replace(pat, repl, n, case, flags, regex) - func = pc.replace_substring_regex if 
regex else pc.replace_substring - result = func(self._pa_array, pattern=pat, replacement=repl, max_replacements=n) - return type(self)(result) + return ArrowExtensionArray._str_replace(self, pat, repl, n, case, flags, regex) def _str_repeat(self, repeats: int | Sequence[int]): if not isinstance(repeats, int): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 0e8e1809d08ac..47d13b331843c 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1840,6 +1840,17 @@ def test_str_replace_negative_n(): expected = pd.Series(["bc", ""], dtype=ArrowDtype(pa.string())) tm.assert_series_equal(expected, actual) + # Same bug for pyarrow-backed StringArray GH#59628 + ser2 = ser.astype(pd.StringDtype(storage="pyarrow")) + actual2 = ser2.str.replace("a", "", -3, True) + expected2 = expected.astype(ser2.dtype) + tm.assert_series_equal(expected2, actual2) + + ser3 = ser.astype(pd.StringDtype(storage="pyarrow", na_value=np.nan)) + actual3 = ser3.str.replace("a", "", -3, True) + expected3 = expected.astype(ser3.dtype) + tm.assert_series_equal(expected3, actual3) + def test_str_repeat_unsupported(): ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) From 62b474bd7a600c7d060e8f262604ae7dd379fa6b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 28 Aug 2024 10:09:48 -0700 Subject: [PATCH 06/34] TST (string): fix xfailed groupby value_counts tests (#59632) --- .../groupby/methods/test_value_counts.py | 37 +++++++++++++------ 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index 51232fac7d6f6..dc986d046ca41 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -8,9 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW import 
pandas.util._test_decorators as td from pandas import ( @@ -288,7 +285,6 @@ def _frame_value_counts(df, keys, normalize, sort, ascending): return df[keys].value_counts(normalize=normalize, sort=sort, ascending=ascending) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("groupby", ["column", "array", "function"]) @pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")]) @pytest.mark.parametrize( @@ -302,7 +298,16 @@ def _frame_value_counts(df, keys, normalize, sort, ascending): @pytest.mark.parametrize("as_index", [True, False]) @pytest.mark.parametrize("frame", [True, False]) def test_against_frame_and_seriesgroupby( - education_df, groupby, normalize, name, sort, ascending, as_index, frame, request + education_df, + groupby, + normalize, + name, + sort, + ascending, + as_index, + frame, + request, + using_infer_string, ): # test all parameters: # - Use column, array or function as by= parameter @@ -366,17 +371,24 @@ def test_against_frame_and_seriesgroupby( index_frame["gender"] = index_frame["both"].str.split("-").str.get(0) index_frame["education"] = index_frame["both"].str.split("-").str.get(1) del index_frame["both"] - index_frame = index_frame.rename({0: None}, axis=1) - expected.index = MultiIndex.from_frame(index_frame) + index_frame2 = index_frame.rename({0: None}, axis=1) + expected.index = MultiIndex.from_frame(index_frame2) + + if index_frame2.columns.isna()[0]: + # with using_infer_string, the columns in index_frame as string + # dtype, which makes the rename({0: None}) above use np.nan + # instead of None, so we need to set None more explicitly. 
+ expected.index.names = [None] + expected.index.names[1:] tm.assert_series_equal(result, expected) else: expected.insert(1, "gender", expected["both"].str.split("-").str.get(0)) expected.insert(2, "education", expected["both"].str.split("-").str.get(1)) + if using_infer_string: + expected = expected.astype({"gender": "str", "education": "str"}) del expected["both"] tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "dtype", [ @@ -403,6 +415,7 @@ def test_compound( expected_count, expected_group_size, dtype, + using_infer_string, ): education_df = education_df.astype(dtype) education_df.columns = education_df.columns.astype(dtype) @@ -425,6 +438,11 @@ def test_compound( expected["count"] = expected_count if dtype == "string[pyarrow]": expected["count"] = expected["count"].convert_dtypes() + if using_infer_string and dtype == object: + expected = expected.astype( + {"country": "str", "gender": "str", "education": "str"} + ) + tm.assert_frame_equal(result, expected) @@ -537,9 +555,6 @@ def names_with_nulls_df(nulls_fixture): ) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False -) @pytest.mark.parametrize( "dropna, expected_data, expected_index", [ From 972369fbe417f6ebc69f8cef764851b0d27fc54b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 28 Aug 2024 10:16:45 -0700 Subject: [PATCH 07/34] REF (string): rename result converter methods (#59626) --- pandas/core/arrays/_arrow_string_mixins.py | 8 +++++ pandas/core/arrays/arrow/array.py | 6 ++++ pandas/core/arrays/string_arrow.py | 38 +++++++++++----------- 3 files changed, 33 insertions(+), 19 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index cc41985843574..a99c370e9d927 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -17,6 +17,14 @@ 
class ArrowStringArrayMixin: def __init__(self, *args, **kwargs) -> None: raise NotImplementedError + def _convert_bool_result(self, result): + # Convert a bool-dtype result to the appropriate result type + raise NotImplementedError + + def _convert_int_result(self, result): + # Convert an integer-dtype result to the appropriate result type + raise NotImplementedError + def _str_pad( self, width: int, diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 5f8963c81b0ba..f976d0b3745e8 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2285,6 +2285,12 @@ def _apply_elementwise(self, func: Callable) -> list[list[Any]]: for chunk in self._pa_array.iterchunks() ] + def _convert_bool_result(self, result): + return type(self)(result) + + def _convert_int_result(self, result): + return type(self)(result) + def _str_count(self, pat: str, flags: int = 0): if flags: raise NotImplementedError(f"count not implemented with {flags=}") diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 5c6cca41be027..f524c8bc5d314 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -214,7 +214,7 @@ def insert(self, loc: int, item) -> ArrowStringArray: raise TypeError("Scalar must be NA or str") return super().insert(loc, item) - def _result_converter(self, values, na=None): + def _convert_bool_result(self, values, na=None): if self.dtype.na_value is np.nan: if not isna(na): values = values.fill_null(bool(na)) @@ -296,7 +296,7 @@ def _str_contains( result = pc.match_substring_regex(self._pa_array, pat, ignore_case=not case) else: result = pc.match_substring(self._pa_array, pat, ignore_case=not case) - result = self._result_converter(result, na=na) + result = self._convert_bool_result(result, na=na) if not isna(na): result[isna(result)] = bool(na) return result @@ -318,7 +318,7 @@ def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | 
None = None): result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) if not isna(na): result = result.fill_null(na) - return self._result_converter(result) + return self._convert_bool_result(result) def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): if isinstance(pat, str): @@ -337,7 +337,7 @@ def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) if not isna(na): result = result.fill_null(na) - return self._result_converter(result) + return self._convert_bool_result(result) def _str_replace( self, @@ -389,43 +389,43 @@ def _str_slice( def _str_isalnum(self): result = pc.utf8_is_alnum(self._pa_array) - return self._result_converter(result) + return self._convert_bool_result(result) def _str_isalpha(self): result = pc.utf8_is_alpha(self._pa_array) - return self._result_converter(result) + return self._convert_bool_result(result) def _str_isdecimal(self): result = pc.utf8_is_decimal(self._pa_array) - return self._result_converter(result) + return self._convert_bool_result(result) def _str_isdigit(self): result = pc.utf8_is_digit(self._pa_array) - return self._result_converter(result) + return self._convert_bool_result(result) def _str_islower(self): result = pc.utf8_is_lower(self._pa_array) - return self._result_converter(result) + return self._convert_bool_result(result) def _str_isnumeric(self): result = pc.utf8_is_numeric(self._pa_array) - return self._result_converter(result) + return self._convert_bool_result(result) def _str_isspace(self): result = pc.utf8_is_space(self._pa_array) - return self._result_converter(result) + return self._convert_bool_result(result) def _str_istitle(self): result = pc.utf8_is_title(self._pa_array) - return self._result_converter(result) + return self._convert_bool_result(result) def _str_isupper(self): result = pc.utf8_is_upper(self._pa_array) - return self._result_converter(result) + return 
self._convert_bool_result(result) def _str_len(self): result = pc.utf8_length(self._pa_array) - return self._convert_int_dtype(result) + return self._convert_int_result(result) def _str_lower(self): return type(self)(pc.utf8_lower(self._pa_array)) @@ -472,7 +472,7 @@ def _str_count(self, pat: str, flags: int = 0): if flags: return super()._str_count(pat, flags) result = pc.count_substring_regex(self._pa_array, pat) - return self._convert_int_dtype(result) + return self._convert_int_result(result) def _str_find(self, sub: str, start: int = 0, end: int | None = None): if start != 0 and end is not None: @@ -486,7 +486,7 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): result = pc.find_substring(slices, sub) else: return super()._str_find(sub, start, end) - return self._convert_int_dtype(result) + return self._convert_int_result(result) def _str_get_dummies(self, sep: str = "|"): dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(sep) @@ -495,7 +495,7 @@ def _str_get_dummies(self, sep: str = "|"): dummies = np.vstack(dummies_pa.to_numpy()) return dummies.astype(np.int64, copy=False), labels - def _convert_int_dtype(self, result): + def _convert_int_result(self, result): if self.dtype.na_value is np.nan: if isinstance(result, pa.Array): result = result.to_numpy(zero_copy_only=False) @@ -522,7 +522,7 @@ def _reduce( result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs) if name in ("argmin", "argmax") and isinstance(result, pa.Array): - return self._convert_int_dtype(result) + return self._convert_int_result(result) elif isinstance(result, pa.Array): return type(self)(result) else: @@ -540,7 +540,7 @@ def _rank( """ See Series.rank.__doc__. 
""" - return self._convert_int_dtype( + return self._convert_int_result( self._rank_calc( axis=axis, method=method, From b350a976fd14d779cd3042dd4a56361ec62e6038 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 28 Aug 2024 12:52:56 -0700 Subject: [PATCH 08/34] TST (string) fix xfailed groupby tests (3) (#59642) * TST (string) fix xfailed groupby tests (3) * TST: non-pyarrow build --- pandas/tests/groupby/methods/test_describe.py | 8 ++--- pandas/tests/groupby/methods/test_nth.py | 6 ++-- pandas/tests/groupby/test_groupby_dropna.py | 16 ---------- .../tests/groupby/transform/test_transform.py | 29 +++++++++++++------ 4 files changed, 24 insertions(+), 35 deletions(-) diff --git a/pandas/tests/groupby/methods/test_describe.py b/pandas/tests/groupby/methods/test_describe.py index 34b046bff7c91..c80063e673b81 100644 --- a/pandas/tests/groupby/methods/test_describe.py +++ b/pandas/tests/groupby/methods/test_describe.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -73,7 +71,6 @@ def test_series_describe_as_index(as_index, keys): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_frame_describe_multikey(tsframe): grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) result = grouped.describe() @@ -82,7 +79,7 @@ def test_frame_describe_multikey(tsframe): group = grouped[col].describe() # GH 17464 - Remove duplicate MultiIndex levels group_col = MultiIndex( - levels=[[col], group.columns], + levels=[Index([col], dtype=tsframe.columns.dtype), group.columns], codes=[[0] * len(group.columns), range(len(group.columns))], ) group = DataFrame(group.values, columns=group_col, index=group.index) @@ -275,7 +272,6 @@ def test_describe(self, df, gb, gni): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) 
@pytest.mark.parametrize("dtype", [int, float, object]) @pytest.mark.parametrize( "kwargs", @@ -297,5 +293,5 @@ def test_groupby_empty_dataset(dtype, kwargs): result = df.iloc[:0].groupby("A").B.describe(**kwargs) expected = df.groupby("A").B.describe(**kwargs).reset_index(drop=True).iloc[:0] - expected.index = Index([]) + expected.index = Index([], dtype=df.columns.dtype) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/methods/test_nth.py b/pandas/tests/groupby/methods/test_nth.py index 344258257ba80..2722993ee5cdf 100644 --- a/pandas/tests/groupby/methods/test_nth.py +++ b/pandas/tests/groupby/methods/test_nth.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -706,14 +704,14 @@ def test_first_multi_key_groupby_categorical(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("method", ["first", "last", "nth"]) def test_groupby_last_first_nth_with_none(method, nulls_fixture): # GH29645 - expected = Series(["y"]) + expected = Series(["y"], dtype=object) data = Series( [nulls_fixture, nulls_fixture, nulls_fixture, "y", nulls_fixture], index=[0, 0, 0, 0, 0], + dtype=object, ).groupby(level=0) if method == "nth": diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 3856a0d8928a7..9c01e017dd29c 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -3,7 +3,6 @@ from pandas._config import using_string_dtype -from pandas.compat import HAS_PYARROW from pandas.compat.pyarrow import pa_version_under10p1 from pandas.core.dtypes.missing import na_value_for_dtype @@ -13,9 +12,6 @@ from pandas.tests.groupby import get_groupby_method_args -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False 
-) @pytest.mark.parametrize( "dropna, tuples, outputs", [ @@ -59,9 +55,6 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_one_group( tm.assert_frame_equal(grouped, expected) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False -) @pytest.mark.parametrize( "dropna, tuples, outputs", [ @@ -138,9 +131,6 @@ def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs): tm.assert_frame_equal(grouped, expected) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False -) @pytest.mark.parametrize( "dropna, idx, expected", [ @@ -216,9 +206,6 @@ def test_groupby_dataframe_slice_then_transform(dropna, index): tm.assert_series_equal(result, expected) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False -) @pytest.mark.parametrize( "dropna, tuples, outputs", [ @@ -300,9 +287,6 @@ def test_groupby_dropna_datetime_like_data( tm.assert_frame_equal(grouped, expected) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False -) @pytest.mark.parametrize( "dropna, data, selected_data, levels", [ diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index a5433d5496b0b..5823656a610e5 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -5,6 +5,7 @@ from pandas._config import using_string_dtype from pandas._libs import lib +from pandas.compat import HAS_PYARROW from pandas.core.dtypes.common import ensure_platform_int @@ -499,8 +500,7 @@ def test_transform_select_columns(df): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_transform_nuisance_raises(df): +def test_transform_nuisance_raises(df, using_infer_string): # case that goes through _transform_item_by_item df.columns = 
["A", "B", "B", "D"] @@ -510,10 +510,16 @@ def test_transform_nuisance_raises(df): grouped = df.groupby("A") gbc = grouped["B"] - with pytest.raises(TypeError, match="Could not convert"): + msg = "Could not convert" + if using_infer_string: + if df.columns.dtype.storage == "pyarrow": + msg = "with dtype str does not support operation 'mean'" + else: + msg = "Cannot perform reduction 'mean' with string dtype" + with pytest.raises(TypeError, match=msg): gbc.transform(lambda x: np.mean(x)) - with pytest.raises(TypeError, match="Could not convert"): + with pytest.raises(TypeError, match=msg): df.groupby("A").transform(lambda x: np.mean(x)) @@ -582,8 +588,7 @@ def test_transform_coercion(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_groupby_transform_with_int(): +def test_groupby_transform_with_int(using_infer_string): # GH 3740, make sure that we might upcast on item-by-item transform # floats @@ -613,8 +618,14 @@ def test_groupby_transform_with_int(): "D": "foo", } ) + msg = "Could not convert" + if using_infer_string: + if HAS_PYARROW: + msg = "with dtype str does not support operation 'mean'" + else: + msg = "Cannot perform reduction 'mean' with string dtype" with np.errstate(all="ignore"): - with pytest.raises(TypeError, match="Could not convert"): + with pytest.raises(TypeError, match=msg): df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) result = df.groupby("A")[["B", "C"]].transform( lambda x: (x - x.mean()) / x.std() @@ -626,7 +637,7 @@ def test_groupby_transform_with_int(): s = Series([2, 3, 4, 10, 5, -1]) df = DataFrame({"A": [1, 1, 1, 2, 2, 2], "B": 1, "C": s, "D": "foo"}) with np.errstate(all="ignore"): - with pytest.raises(TypeError, match="Could not convert"): + with pytest.raises(TypeError, match=msg): df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) result = df.groupby("A")[["B", "C"]].transform( lambda x: (x - x.mean()) / x.std() @@ -850,7 +861,6 @@ 
def test_cython_transform_frame(request, op, args, targop, df_fix, gb_target): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.slow @pytest.mark.parametrize( "op, args, targop", @@ -901,6 +911,7 @@ def test_cython_transform_frame_column( "does not support .* operations", ".* is not supported for object dtype", "is not implemented for this dtype", + ".* is not supported for str dtype", ] ) with pytest.raises(TypeError, match=msg): From 3121121307951743cf27c540b1edf4300b0aab16 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 29 Aug 2024 05:27:10 -0700 Subject: [PATCH 09/34] REF (string): de-duplicate str_endswith, startswith (#59568) --- pandas/core/arrays/_arrow_string_mixins.py | 48 +++++++++++++++++++++- pandas/core/arrays/arrow/array.py | 33 +-------------- pandas/core/arrays/string_arrow.py | 40 +----------------- 3 files changed, 49 insertions(+), 72 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index a99c370e9d927..9b84ddb7cfe55 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -1,18 +1,28 @@ from __future__ import annotations -from typing import Literal +from typing import ( + TYPE_CHECKING, + Literal, +) import numpy as np from pandas.compat import pa_version_under10p1 +from pandas.core.dtypes.missing import isna + if not pa_version_under10p1: import pyarrow as pa import pyarrow.compute as pc +if TYPE_CHECKING: + from collections.abc import Sized + + from pandas._typing import Scalar + class ArrowStringArrayMixin: - _pa_array = None + _pa_array: Sized def __init__(self, *args, **kwargs) -> None: raise NotImplementedError @@ -90,3 +100,37 @@ def _str_removesuffix(self, suffix: str): removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) result = pc.if_else(ends_with, removed, self._pa_array) return type(self)(result) + + def 
_str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): + if isinstance(pat, str): + result = pc.starts_with(self._pa_array, pattern=pat) + else: + if len(pat) == 0: + # For empty tuple we return null for missing values and False + # for valid values. + result = pc.if_else(pc.is_null(self._pa_array), None, False) + else: + result = pc.starts_with(self._pa_array, pattern=pat[0]) + + for p in pat[1:]: + result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) + if not isna(na): # pyright: ignore [reportGeneralTypeIssues] + result = result.fill_null(na) + return self._convert_bool_result(result) + + def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): + if isinstance(pat, str): + result = pc.ends_with(self._pa_array, pattern=pat) + else: + if len(pat) == 0: + # For empty tuple we return null for missing values and False + # for valid values. + result = pc.if_else(pc.is_null(self._pa_array), None, False) + else: + result = pc.ends_with(self._pa_array, pattern=pat[0]) + + for p in pat[1:]: + result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) + if not isna(na): # pyright: ignore [reportGeneralTypeIssues] + result = result.fill_null(na) + return self._convert_bool_result(result) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index f976d0b3745e8..220ce96c22a13 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2311,38 +2311,7 @@ def _str_contains( result = result.fill_null(na) return type(self)(result) - def _str_startswith(self, pat: str | tuple[str, ...], na=None): - if isinstance(pat, str): - result = pc.starts_with(self._pa_array, pattern=pat) - else: - if len(pat) == 0: - # For empty tuple, pd.StringDtype() returns null for missing values - # and false for valid values. 
- result = pc.if_else(pc.is_null(self._pa_array), None, False) - else: - result = pc.starts_with(self._pa_array, pattern=pat[0]) - - for p in pat[1:]: - result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) - if not isna(na): - result = result.fill_null(na) - return type(self)(result) - - def _str_endswith(self, pat: str | tuple[str, ...], na=None): - if isinstance(pat, str): - result = pc.ends_with(self._pa_array, pattern=pat) - else: - if len(pat) == 0: - # For empty tuple, pd.StringDtype() returns null for missing values - # and false for valid values. - result = pc.if_else(pc.is_null(self._pa_array), None, False) - else: - result = pc.ends_with(self._pa_array, pattern=pat[0]) - - for p in pat[1:]: - result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) - if not isna(na): - result = result.fill_null(na) + def _result_converter(self, result): return type(self)(result) def _str_replace( diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index f524c8bc5d314..a8590d3c9b526 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -284,6 +284,8 @@ def _data(self): # String methods interface _str_map = BaseStringArray._str_map + _str_startswith = ArrowStringArrayMixin._str_startswith + _str_endswith = ArrowStringArrayMixin._str_endswith def _str_contains( self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True @@ -301,44 +303,6 @@ def _str_contains( result[isna(result)] = bool(na) return result - def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): - if isinstance(pat, str): - result = pc.starts_with(self._pa_array, pattern=pat) - else: - if len(pat) == 0: - # mimic existing behaviour of string extension array - # and python string method - result = pa.array( - np.zeros(len(self._pa_array), dtype=bool), mask=isna(self._pa_array) - ) - else: - result = pc.starts_with(self._pa_array, pattern=pat[0]) - - for p in pat[1:]: - result = 
pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) - if not isna(na): - result = result.fill_null(na) - return self._convert_bool_result(result) - - def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): - if isinstance(pat, str): - result = pc.ends_with(self._pa_array, pattern=pat) - else: - if len(pat) == 0: - # mimic existing behaviour of string extension array - # and python string method - result = pa.array( - np.zeros(len(self._pa_array), dtype=bool), mask=isna(self._pa_array) - ) - else: - result = pc.ends_with(self._pa_array, pattern=pat[0]) - - for p in pat[1:]: - result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) - if not isna(na): - result = result.fill_null(na) - return self._convert_bool_result(result) - def _str_replace( self, pat: str | re.Pattern, From 866a7f6776b651f3f634a1d640bc715919d7f558 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 31 Aug 2024 09:46:27 -0700 Subject: [PATCH 10/34] DEPR (string): non-bool na for obj.str.contains (#59615) Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v2.3.0.rst | 2 +- pandas/core/arrays/string_arrow.py | 8 ++++ pandas/core/strings/object_array.py | 26 +++++++++++ pandas/tests/strings/test_find_replace.py | 55 +++++++++++++++++++++-- 4 files changed, 87 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 528226502da33..8a64aa7c609d6 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -53,7 +53,7 @@ notable_bug_fix1 Deprecations ~~~~~~~~~~~~ -- +- Deprecated allowing non-``bool`` values for ``na`` in :meth:`.str.contains`, :meth:`.str.startswith`, and :meth:`.str.endswith` for dtypes that do not already disallow these (:issue:`59615`) - .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index a8590d3c9b526..6ae6e75bbf00d 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -300,6 +300,14 @@ def _str_contains( result = pc.match_substring(self._pa_array, pat, ignore_case=not case) result = self._convert_bool_result(result, na=na) if not isna(na): + if not isinstance(na, bool): + # GH#59561 + warnings.warn( + "Allowing a non-bool 'na' in obj.str.contains is deprecated " + "and will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) result[isna(result)] = bool(na) return result diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 090e27ec58cc3..f376c239a0ce0 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -10,12 +10,14 @@ cast, ) import unicodedata +import warnings import numpy as np from pandas._libs import lib import pandas._libs.missing as libmissing import pandas._libs.ops as libops +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.missing import isna @@ -140,14 +142,38 @@ def _str_contains( else: upper_pat = pat.upper() f = lambda x: upper_pat in x.upper() + if not isna(na) and not isinstance(na, bool): + # GH#59561 + warnings.warn( + "Allowing a non-bool 'na' in obj.str.contains is deprecated " + "and will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) return self._str_map(f, na, dtype=np.dtype("bool")) def _str_startswith(self, pat, na=None): f = lambda x: x.startswith(pat) + if not isna(na) and not isinstance(na, bool): + # GH#59561 + warnings.warn( + "Allowing a non-bool 'na' in obj.str.startswith is deprecated " + "and will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) return self._str_map(f, na_value=na, dtype=np.dtype(bool)) def 
_str_endswith(self, pat, na=None): f = lambda x: x.endswith(pat) + if not isna(na) and not isinstance(na, bool): + # GH#59561 + warnings.warn( + "Allowing a non-bool 'na' in obj.str.endswith is deprecated " + "and will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) return self._str_map(f, na_value=na, dtype=np.dtype(bool)) def _str_replace( diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 78ce1d7418886..8c5a9b39157ea 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -4,6 +4,9 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + +from pandas.compat import HAS_PYARROW from pandas.errors import PerformanceWarning import pandas.util._test_decorators as td @@ -167,7 +170,16 @@ def test_contains_na_kwarg_for_nullable_string_dtype( # https://github.com/pandas-dev/pandas/pull/41025#issuecomment-824062416 values = Series(["a", "b", "c", "a", np.nan], dtype=nullable_string_dtype) - result = values.str.contains("a", na=na, regex=regex) + + msg = ( + "Allowing a non-bool 'na' in obj.str.contains is deprecated and " + "will raise in a future version" + ) + warn = None + if not pd.isna(na) and not isinstance(na, bool): + warn = FutureWarning + with tm.assert_produces_warning(warn, match=msg): + result = values.str.contains("a", na=na, regex=regex) expected = Series([True, False, False, True, expected], dtype="boolean") tm.assert_series_equal(result, expected) @@ -233,6 +245,7 @@ def test_contains_nan(any_string_dtype): expected = Series([True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) + # TODO(infer_string) # this particular combination of events is broken on 2.3 # would require cherry picking #58483, which in turn requires #57481 # which introduce many behavioral changes @@ -241,14 +254,19 @@ def test_contains_nan(any_string_dtype): and any_string_dtype.storage == 
"python" and any_string_dtype.na_value is np.nan ): - result = s.str.contains("foo", na="foo") + msg = ( + "Allowing a non-bool 'na' in obj.str.contains is deprecated and " + "will raise in a future version" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.str.contains("foo", na="foo") if any_string_dtype == "object": expected = Series(["foo", "foo", "foo"], dtype=np.object_) elif any_string_dtype.na_value is np.nan: expected = Series([True, True, True], dtype=np.bool_) else: expected = Series([True, True, True], dtype="boolean") - tm.assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = s.str.contains("foo") expected_dtype = ( @@ -263,6 +281,37 @@ def test_contains_nan(any_string_dtype): # -------------------------------------------------------------------------------------- +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False +) +def test_startswith_endswith_validate_na(any_string_dtype): + # GH#59615 + ser = Series( + ["om", np.nan, "foo_nom", "nom", "bar_foo", np.nan, "foo"], + dtype=any_string_dtype, + ) + + dtype = ser.dtype + if ( + isinstance(dtype, pd.StringDtype) and dtype.storage == "python" + ) or dtype == np.dtype("object"): + msg = "Allowing a non-bool 'na' in obj.str.startswith is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + ser.str.startswith("kapow", na="baz") + msg = "Allowing a non-bool 'na' in obj.str.endswith is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + ser.str.endswith("bar", na="baz") + else: + # TODO(infer_string): don't surface pyarrow errors + import pyarrow as pa + + msg = "Could not convert 'baz' with type str: tried to convert to boolean" + with pytest.raises(pa.lib.ArrowInvalid, match=msg): + ser.str.startswith("kapow", na="baz") + with pytest.raises(pa.lib.ArrowInvalid, match=msg): + ser.str.endswith("kapow", na="baz") + + @pytest.mark.parametrize("pat", 
["foo", ("foo", "baz")]) @pytest.mark.parametrize("dtype", ["object", "category"]) @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA]) From b313cf5ce3559febe1fcb93de83d4cd62cb9db6f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 2 Sep 2024 20:00:59 +0200 Subject: [PATCH 11/34] TST (string dtype): fix and clean up arrow roundtrip tests (#59678) * TST (string dtype): fix and clean up arrow roundtrip tests * fix using_infer_string --- pandas/tests/arrays/masked/test_arrow_compat.py | 11 +++-------- pandas/tests/arrays/string_/test_string.py | 14 ++++++++++---- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py index 31765165f5f16..293ee4095d02e 100644 --- a/pandas/tests/arrays/masked/test_arrow_compat.py +++ b/pandas/tests/arrays/masked/test_arrow_compat.py @@ -1,17 +1,12 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd import pandas._testing as tm -pytestmark = [ - pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" - ), - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) pa = pytest.importorskip("pyarrow") diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 1296cc3b5a494..4c53dabcdbf7a 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -524,7 +524,6 @@ def test_arrow_array(dtype): assert arr.equals(expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_arrow_roundtrip(dtype, string_storage, using_infer_string): # roundtrip possible from arrow 1.0.0 @@ 
-543,13 +542,16 @@ def test_arrow_roundtrip(dtype, string_storage, using_infer_string): assert result["a"].dtype == "object" else: assert isinstance(result["a"].dtype, pd.StringDtype) - expected = df.astype(f"string[{string_storage}]") + expected = df.astype(pd.StringDtype(string_storage, na_value=dtype.na_value)) + if using_infer_string: + expected.columns = expected.columns.astype( + pd.StringDtype(string_storage, na_value=np.nan) + ) tm.assert_frame_equal(result, expected) # ensure the missing value is represented by NA and not np.nan or None assert result.loc[2, "a"] is result["a"].dtype.na_value -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string): # GH-41040 @@ -571,7 +573,11 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string): assert result["a"].dtype == "object" else: assert isinstance(result["a"].dtype, pd.StringDtype) - expected = df.astype(f"string[{string_storage}]") + expected = df.astype(pd.StringDtype(string_storage, na_value=dtype.na_value)) + if using_infer_string: + expected.columns = expected.columns.astype( + pd.StringDtype(string_storage, na_value=np.nan) + ) tm.assert_frame_equal(result, expected) From 449a09491b2bacc5812e30e3989d93f9c7d1f523 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 2 Sep 2024 11:47:39 -0700 Subject: [PATCH 12/34] API (string): str.center with pyarrow-backed string dtype (#59624) --- doc/source/whatsnew/v2.3.0.rst | 3 ++- pandas/core/arrays/_arrow_string_mixins.py | 20 ++++++++++++++++++-- pandas/core/arrays/string_arrow.py | 2 +- pandas/tests/strings/test_case_justify.py | 6 +----- 4 files changed, 22 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 8a64aa7c609d6..03355f655eb28 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ 
b/doc/source/whatsnew/v2.3.0.rst @@ -103,7 +103,8 @@ Conversion Strings ^^^^^^^ - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`) -- +- Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`) + Interval ^^^^^^^^ diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 9b84ddb7cfe55..e8051c803676c 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -1,5 +1,6 @@ from __future__ import annotations +from functools import partial from typing import ( TYPE_CHECKING, Literal, @@ -7,7 +8,10 @@ import numpy as np -from pandas.compat import pa_version_under10p1 +from pandas.compat import ( + pa_version_under10p1, + pa_version_under17p0, +) from pandas.core.dtypes.missing import isna @@ -46,7 +50,19 @@ def _str_pad( elif side == "right": pa_pad = pc.utf8_rpad elif side == "both": - pa_pad = pc.utf8_center + if pa_version_under17p0: + # GH#59624 fall back to object dtype + from pandas import array + + obj_arr = self.astype(object, copy=False) # type: ignore[attr-defined] + obj = array(obj_arr, dtype=object) + result = obj._str_pad(width, side, fillchar) # type: ignore[attr-defined] + return type(self)._from_sequence(result, dtype=self.dtype) # type: ignore[attr-defined] + else: + # GH#54792 + # https://github.com/apache/arrow/issues/15053#issuecomment-2317032347 + lean_left = (width % 2) == 0 + pa_pad = partial(pc.utf8_center, lean_left_on_odd_padding=lean_left) else: raise ValueError( f"Invalid side: {side}. 
Side must be one of 'left', 'right', 'both'" diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 6ae6e75bbf00d..e4fcf6775e8f4 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -286,6 +286,7 @@ def _data(self): _str_map = BaseStringArray._str_map _str_startswith = ArrowStringArrayMixin._str_startswith _str_endswith = ArrowStringArrayMixin._str_endswith + _str_pad = ArrowStringArrayMixin._str_pad def _str_contains( self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True @@ -546,7 +547,6 @@ class ArrowStringArrayNumpySemantics(ArrowStringArray): _str_get = ArrowStringArrayMixin._str_get _str_removesuffix = ArrowStringArrayMixin._str_removesuffix _str_capitalize = ArrowStringArrayMixin._str_capitalize - _str_pad = ArrowStringArrayMixin._str_pad _str_title = ArrowStringArrayMixin._str_title _str_swapcase = ArrowStringArrayMixin._str_swapcase _str_slice_replace = ArrowStringArrayMixin._str_slice_replace diff --git a/pandas/tests/strings/test_case_justify.py b/pandas/tests/strings/test_case_justify.py index 41aedae90ca76..819556f961fa3 100644 --- a/pandas/tests/strings/test_case_justify.py +++ b/pandas/tests/strings/test_case_justify.py @@ -291,11 +291,7 @@ def test_center_ljust_rjust_mixed_object(): def test_center_ljust_rjust_fillchar(any_string_dtype): - if any_string_dtype == "string[pyarrow_numpy]": - pytest.skip( - "Arrow logic is different, " - "see https://github.com/pandas-dev/pandas/pull/54533/files#r1299808126", - ) + # GH#54533, GH#54792 s = Series(["a", "bb", "cccc", "ddddd", "eeeeee"], dtype=any_string_dtype) result = s.str.center(5, fillchar="X") From 63dbe97bb71eaf37cc8a6942e1d8279360c6417e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 4 Sep 2024 09:27:00 -0700 Subject: [PATCH 13/34] REF (string): de-duplicate str_isfoo methods (#59705) --- pandas/core/arrays/_arrow_string_mixins.py | 40 ++++++++++++++++++- pandas/core/arrays/arrow/array.py | 27 
------------- pandas/core/arrays/string_arrow.py | 46 +++++----------------- 3 files changed, 48 insertions(+), 65 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index e8051c803676c..7f3e6eb67249e 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -52,10 +52,10 @@ def _str_pad( elif side == "both": if pa_version_under17p0: # GH#59624 fall back to object dtype - from pandas import array + from pandas import array as pd_array obj_arr = self.astype(object, copy=False) # type: ignore[attr-defined] - obj = array(obj_arr, dtype=object) + obj = pd_array(obj_arr, dtype=object) result = obj._str_pad(width, side, fillchar) # type: ignore[attr-defined] return type(self)._from_sequence(result, dtype=self.dtype) # type: ignore[attr-defined] else: @@ -150,3 +150,39 @@ def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): if not isna(na): # pyright: ignore [reportGeneralTypeIssues] result = result.fill_null(na) return self._convert_bool_result(result) + + def _str_isalnum(self): + result = pc.utf8_is_alnum(self._pa_array) + return self._convert_bool_result(result) + + def _str_isalpha(self): + result = pc.utf8_is_alpha(self._pa_array) + return self._convert_bool_result(result) + + def _str_isdecimal(self): + result = pc.utf8_is_decimal(self._pa_array) + return self._convert_bool_result(result) + + def _str_isdigit(self): + result = pc.utf8_is_digit(self._pa_array) + return self._convert_bool_result(result) + + def _str_islower(self): + result = pc.utf8_is_lower(self._pa_array) + return self._convert_bool_result(result) + + def _str_isnumeric(self): + result = pc.utf8_is_numeric(self._pa_array) + return self._convert_bool_result(result) + + def _str_isspace(self): + result = pc.utf8_is_space(self._pa_array) + return self._convert_bool_result(result) + + def _str_istitle(self): + result = pc.utf8_is_title(self._pa_array) + return 
self._convert_bool_result(result) + + def _str_isupper(self): + result = pc.utf8_is_upper(self._pa_array) + return self._convert_bool_result(result) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 220ce96c22a13..4cd8f7f9505d6 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2411,33 +2411,6 @@ def _str_slice( pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) ) - def _str_isalnum(self): - return type(self)(pc.utf8_is_alnum(self._pa_array)) - - def _str_isalpha(self): - return type(self)(pc.utf8_is_alpha(self._pa_array)) - - def _str_isdecimal(self): - return type(self)(pc.utf8_is_decimal(self._pa_array)) - - def _str_isdigit(self): - return type(self)(pc.utf8_is_digit(self._pa_array)) - - def _str_islower(self): - return type(self)(pc.utf8_is_lower(self._pa_array)) - - def _str_isnumeric(self): - return type(self)(pc.utf8_is_numeric(self._pa_array)) - - def _str_isspace(self): - return type(self)(pc.utf8_is_space(self._pa_array)) - - def _str_istitle(self): - return type(self)(pc.utf8_is_title(self._pa_array)) - - def _str_isupper(self): - return type(self)(pc.utf8_is_upper(self._pa_array)) - def _str_len(self): return type(self)(pc.utf8_length(self._pa_array)) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index e4fcf6775e8f4..a806ee86999c2 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -283,6 +283,16 @@ def _data(self): # ------------------------------------------------------------------------ # String methods interface + _str_isalnum = ArrowStringArrayMixin._str_isalnum + _str_isalpha = ArrowStringArrayMixin._str_isalpha + _str_isdecimal = ArrowStringArrayMixin._str_isdecimal + _str_isdigit = ArrowStringArrayMixin._str_isdigit + _str_islower = ArrowStringArrayMixin._str_islower + _str_isnumeric = ArrowStringArrayMixin._str_isnumeric + _str_isspace = 
ArrowStringArrayMixin._str_isspace + _str_istitle = ArrowStringArrayMixin._str_istitle + _str_isupper = ArrowStringArrayMixin._str_isupper + _str_map = BaseStringArray._str_map _str_startswith = ArrowStringArrayMixin._str_startswith _str_endswith = ArrowStringArrayMixin._str_endswith @@ -360,42 +370,6 @@ def _str_slice( pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) ) - def _str_isalnum(self): - result = pc.utf8_is_alnum(self._pa_array) - return self._convert_bool_result(result) - - def _str_isalpha(self): - result = pc.utf8_is_alpha(self._pa_array) - return self._convert_bool_result(result) - - def _str_isdecimal(self): - result = pc.utf8_is_decimal(self._pa_array) - return self._convert_bool_result(result) - - def _str_isdigit(self): - result = pc.utf8_is_digit(self._pa_array) - return self._convert_bool_result(result) - - def _str_islower(self): - result = pc.utf8_is_lower(self._pa_array) - return self._convert_bool_result(result) - - def _str_isnumeric(self): - result = pc.utf8_is_numeric(self._pa_array) - return self._convert_bool_result(result) - - def _str_isspace(self): - result = pc.utf8_is_space(self._pa_array) - return self._convert_bool_result(result) - - def _str_istitle(self): - result = pc.utf8_is_title(self._pa_array) - return self._convert_bool_result(result) - - def _str_isupper(self): - result = pc.utf8_is_upper(self._pa_array) - return self._convert_bool_result(result) - def _str_len(self): result = pc.utf8_length(self._pa_array) return self._convert_int_result(result) From 2f4af6b63c388c453e472241788f9f84675c07fb Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 4 Sep 2024 09:38:26 -0700 Subject: [PATCH 14/34] TST (string): copy/view tests (#59702) --- pandas/core/dtypes/dtypes.py | 2 +- pandas/tests/copy_view/test_constructors.py | 5 +--- pandas/tests/copy_view/test_functions.py | 32 ++++++++++----------- pandas/tests/copy_view/test_internals.py | 10 +++---- pandas/tests/dtypes/test_dtypes.py | 3 -- 5 files 
changed, 23 insertions(+), 29 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 1c43ef55c11d7..c6ca24d19b906 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -453,7 +453,7 @@ def __eq__(self, other: object) -> bool: # Because left and right have the same length and are unique, # `indexer` not having any -1s implies that there is a # bijection between `left` and `right`. - return (indexer != -1).all() + return bool((indexer != -1).all()) # With object-dtype we need a comparison that identifies # e.g. int(2) as distinct from float(2) diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index 866b1964a334f..66c9b456f18ad 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -285,10 +283,9 @@ def test_dataframe_from_dict_of_series_with_reindex(dtype): assert np.shares_memory(arr_before, arr_after) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("cons", [Series, Index]) @pytest.mark.parametrize( - "data, dtype", [([1, 2], None), ([1, 2], "int64"), (["a", "b"], None)] + "data, dtype", [([1, 2], None), ([1, 2], "int64"), (["a", "b"], object)] ) def test_dataframe_from_series_or_index( using_copy_on_write, warn_copy_on_write, data, dtype, cons diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index a87baaedb9244..23ed7f9edcd22 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -16,10 +16,9 @@ from pandas.tests.copy_view.util import get_array -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_concat_frames(using_copy_on_write): - df = 
DataFrame({"b": ["a"] * 3}) - df2 = DataFrame({"a": ["a"] * 3}) + df = DataFrame({"b": ["a"] * 3}, dtype=object) + df2 = DataFrame({"a": ["a"] * 3}, dtype=object) df_orig = df.copy() result = concat([df, df2], axis=1) @@ -41,10 +40,9 @@ def test_concat_frames(using_copy_on_write): tm.assert_frame_equal(df, df_orig) -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_concat_frames_updating_input(using_copy_on_write): - df = DataFrame({"b": ["a"] * 3}) - df2 = DataFrame({"a": ["a"] * 3}) + df = DataFrame({"b": ["a"] * 3}, dtype=object) + df2 = DataFrame({"a": ["a"] * 3}, dtype=object) result = concat([df, df2], axis=1) if using_copy_on_write: @@ -203,7 +201,7 @@ def test_concat_copy_keyword(using_copy_on_write, copy): assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b")) -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") +# @pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") @pytest.mark.parametrize( "func", [ @@ -212,8 +210,8 @@ def test_concat_copy_keyword(using_copy_on_write, copy): ], ) def test_merge_on_key(using_copy_on_write, func): - df1 = DataFrame({"key": ["a", "b", "c"], "a": [1, 2, 3]}) - df2 = DataFrame({"key": ["a", "b", "c"], "b": [4, 5, 6]}) + df1 = DataFrame({"key": Series(["a", "b", "c"], dtype=object), "a": [1, 2, 3]}) + df2 = DataFrame({"key": Series(["a", "b", "c"], dtype=object), "b": [4, 5, 6]}) df1_orig = df1.copy() df2_orig = df2.copy() @@ -267,7 +265,6 @@ def test_merge_on_index(using_copy_on_write): tm.assert_frame_equal(df2, df2_orig) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "func, how", [ @@ -276,8 +273,8 @@ def test_merge_on_index(using_copy_on_write): ], ) def test_merge_on_key_enlarging_one(using_copy_on_write, func, how): - df1 = DataFrame({"key": ["a", "b", "c"], "a": [1, 2, 3]}) - df2 = DataFrame({"key": ["a", "b"], "b": [4, 5]}) + 
df1 = DataFrame({"key": Series(["a", "b", "c"], dtype=object), "a": [1, 2, 3]}) + df2 = DataFrame({"key": Series(["a", "b"], dtype=object), "b": [4, 5]}) df1_orig = df1.copy() df2_orig = df2.copy() @@ -321,9 +318,13 @@ def test_merge_copy_keyword(using_copy_on_write, copy): assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b")) -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") +@pytest.mark.xfail( + using_string_dtype() and HAS_PYARROW, + reason="TODO(infer_string); result.index infers str dtype while both " + "df1 and df2 index are object.", +) def test_join_on_key(using_copy_on_write): - df_index = Index(["a", "b", "c"], name="key") + df_index = Index(["a", "b", "c"], name="key", dtype=object) df1 = DataFrame({"a": [1, 2, 3]}, index=df_index.copy(deep=True)) df2 = DataFrame({"b": [4, 5, 6]}, index=df_index.copy(deep=True)) @@ -355,9 +356,8 @@ def test_join_on_key(using_copy_on_write): tm.assert_frame_equal(df2, df2_orig) -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_join_multiple_dataframes_on_key(using_copy_on_write): - df_index = Index(["a", "b", "c"], name="key") + df_index = Index(["a", "b", "c"], name="key", dtype=object) df1 = DataFrame({"a": [1, 2, 3]}, index=df_index.copy(deep=True)) dfs_list = [ diff --git a/pandas/tests/copy_view/test_internals.py b/pandas/tests/copy_view/test_internals.py index 6f7198520d22e..8526d38588897 100644 --- a/pandas/tests/copy_view/test_internals.py +++ b/pandas/tests/copy_view/test_internals.py @@ -1,12 +1,13 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td import pandas as pd -from pandas import DataFrame +from pandas import ( + DataFrame, + Series, +) import pandas._testing as tm from pandas.tests.copy_view.util import get_array @@ -78,7 +79,6 @@ def test_switch_options(): @td.skip_array_manager_invalid_test 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype", [np.intp, np.int8]) @pytest.mark.parametrize( "locs, arr", @@ -105,7 +105,7 @@ def test_iset_splits_blocks_inplace(using_copy_on_write, locs, arr, dtype): "c": [7, 8, 9], "d": [10, 11, 12], "e": [13, 14, 15], - "f": ["a", "b", "c"], + "f": Series(["a", "b", "c"], dtype=object), }, ) arr = arr.astype(dtype) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index a4916ed1bbd8a..a5666e169fb4c 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -5,8 +5,6 @@ import pytest import pytz -from pandas._config import using_string_dtype - from pandas._libs.tslibs.dtypes import NpyDatetimeUnit from pandas.core.dtypes.base import _registry as registry @@ -961,7 +959,6 @@ def test_same_categories_different_order(self): c2 = CategoricalDtype(["b", "a"], ordered=True) assert c1 is not c2 - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("ordered1", [True, False, None]) @pytest.mark.parametrize("ordered2", [True, False, None]) def test_categorical_equality(self, ordered1, ordered2): From c807defa7413399dd76a98eb4d2eaec81996af39 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 4 Sep 2024 12:44:42 -0700 Subject: [PATCH 15/34] TST (string): more targeted xfails in test_string.py (#59703) * TST (string): more targeted xfails in test_string.py * Fix no-pyarrow test * Update pandas/tests/extension/test_string.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * Update pandas/tests/extension/test_string.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/tests/extension/test_string.py | 36 +++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git 
a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index e44881a6d78ff..7f04858318013 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -21,7 +21,7 @@ import numpy as np import pytest -from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW import pandas as pd import pandas._testing as tm @@ -30,10 +30,6 @@ from pandas.core.arrays.string_ import StringDtype from pandas.tests.extension import base -pytestmark = pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string)", strict=False -) - def maybe_split_array(arr, chunked): if not chunked: @@ -220,6 +216,36 @@ def test_compare_scalar(self, data, comparison_op): def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op) + def test_combine_add(self, data_repeated, using_infer_string, request): + dtype = next(data_repeated(1)).dtype + if using_infer_string and ( + (dtype.na_value is pd.NA) and dtype.storage == "python" + ): + mark = pytest.mark.xfail( + reason="The pointwise operation result will be inferred to " + "string[nan, pyarrow], which does not match the input dtype" + ) + request.applymarker(mark) + super().test_combine_add(data_repeated) + + def test_arith_series_with_array( + self, data, all_arithmetic_operators, using_infer_string, request + ): + dtype = data.dtype + if ( + using_infer_string + and all_arithmetic_operators == "__radd__" + and ( + (dtype.na_value is pd.NA) or (dtype.storage == "python" and HAS_PYARROW) + ) + ): + mark = pytest.mark.xfail( + reason="The pointwise operation result will be inferred to " + "string[nan, pyarrow], which does not match the input dtype" + ) + request.applymarker(mark) + super().test_arith_series_with_array(data, all_arithmetic_operators) + class Test2DCompat(base.Dim2CompatTests): @pytest.fixture(autouse=True) From 553780a7ec6b9a259d0cb4a25870bfbbf63b232a Mon 
Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 5 Sep 2024 16:21:06 -0700 Subject: [PATCH 16/34] REF (string): de-duplicate _str_contains (#59709) * REF: de-duplicate _str_contains * pyright ignore --- pandas/core/arrays/_arrow_string_mixins.py | 15 +++++++++++++++ pandas/core/arrays/arrow/array.py | 15 --------------- pandas/core/arrays/string_arrow.py | 14 ++++---------- 3 files changed, 19 insertions(+), 25 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 7f3e6eb67249e..1a90e4e876faf 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -186,3 +186,18 @@ def _str_istitle(self): def _str_isupper(self): result = pc.utf8_is_upper(self._pa_array) return self._convert_bool_result(result) + + def _str_contains( + self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True + ): + if flags: + raise NotImplementedError(f"contains not implemented with {flags=}") + + if regex: + pa_contains = pc.match_substring_regex + else: + pa_contains = pc.match_substring + result = pa_contains(self._pa_array, pat, ignore_case=not case) + if not isna(na): # pyright: ignore [reportGeneralTypeIssues] + result = result.fill_null(na) + return self._convert_bool_result(result) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 4cd8f7f9505d6..d4aaef7eced83 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2296,21 +2296,6 @@ def _str_count(self, pat: str, flags: int = 0): raise NotImplementedError(f"count not implemented with {flags=}") return type(self)(pc.count_substring_regex(self._pa_array, pat)) - def _str_contains( - self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True - ): - if flags: - raise NotImplementedError(f"contains not implemented with {flags=}") - - if regex: - pa_contains = pc.match_substring_regex - else: - pa_contains = 
pc.match_substring - result = pa_contains(self._pa_array, pat, ignore_case=not case) - if not isna(na): - result = result.fill_null(na) - return type(self)(result) - def _result_converter(self, result): return type(self)(result) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index a806ee86999c2..a3c2659beced0 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -214,10 +214,8 @@ def insert(self, loc: int, item) -> ArrowStringArray: raise TypeError("Scalar must be NA or str") return super().insert(loc, item) - def _convert_bool_result(self, values, na=None): + def _convert_bool_result(self, values): if self.dtype.na_value is np.nan: - if not isna(na): - values = values.fill_null(bool(na)) return ArrowExtensionArray(values).to_numpy(na_value=np.nan) return BooleanDtype().__from_arrow__(values) @@ -305,11 +303,6 @@ def _str_contains( fallback_performancewarning() return super()._str_contains(pat, case, flags, na, regex) - if regex: - result = pc.match_substring_regex(self._pa_array, pat, ignore_case=not case) - else: - result = pc.match_substring(self._pa_array, pat, ignore_case=not case) - result = self._convert_bool_result(result, na=na) if not isna(na): if not isinstance(na, bool): # GH#59561 @@ -319,8 +312,9 @@ def _str_contains( FutureWarning, stacklevel=find_stack_level(), ) - result[isna(result)] = bool(na) - return result + na = bool(na) + + return ArrowStringArrayMixin._str_contains(self, pat, case, flags, na, regex) def _str_replace( self, From 44325c1d9c774c3f897a7dfd2e4e9919d083b778 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 6 Sep 2024 08:06:15 -0700 Subject: [PATCH 17/34] BUG (string): ArrowStringArray.find corner cases (#59562) --- pandas/core/arrays/_arrow_string_mixins.py | 44 +++++++++++++++++- pandas/core/arrays/arrow/array.py | 17 ------- pandas/core/arrays/string_arrow.py | 18 +++----- pandas/tests/extension/test_arrow.py | 52 ++++++++++++++++++++-- 4 files 
changed, 99 insertions(+), 32 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 1a90e4e876faf..4829b175783ed 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -3,6 +3,7 @@ from functools import partial from typing import ( TYPE_CHECKING, + Any, Literal, ) @@ -10,6 +11,7 @@ from pandas.compat import ( pa_version_under10p1, + pa_version_under13p0, pa_version_under17p0, ) @@ -20,7 +22,10 @@ import pyarrow.compute as pc if TYPE_CHECKING: - from collections.abc import Sized + from collections.abc import ( + Callable, + Sized, + ) from pandas._typing import Scalar @@ -39,6 +44,9 @@ def _convert_int_result(self, result): # Convert an integer-dtype result to the appropriate result type raise NotImplementedError + def _apply_elementwise(self, func: Callable) -> list[list[Any]]: + raise NotImplementedError + def _str_pad( self, width: int, @@ -201,3 +209,37 @@ def _str_contains( if not isna(na): # pyright: ignore [reportGeneralTypeIssues] result = result.fill_null(na) return self._convert_bool_result(result) + + def _str_find(self, sub: str, start: int = 0, end: int | None = None): + if ( + pa_version_under13p0 + and not (start != 0 and end is not None) + and not (start == 0 and end is None) + ): + # GH#59562 + res_list = self._apply_elementwise(lambda val: val.find(sub, start, end)) + return self._convert_int_result(pa.chunked_array(res_list)) + + if (start == 0 or start is None) and end is None: + result = pc.find_substring(self._pa_array, sub) + else: + if sub == "": + # GH#56792 + res_list = self._apply_elementwise( + lambda val: val.find(sub, start, end) + ) + return self._convert_int_result(pa.chunked_array(res_list)) + if start is None: + start_offset = 0 + start = 0 + elif start < 0: + start_offset = pc.add(start, pc.utf8_length(self._pa_array)) + start_offset = pc.if_else(pc.less(start_offset, 0), 0, start_offset) + else: + start_offset = 
start + slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) + result = pc.find_substring(slices, sub) + found = pc.not_equal(result, pa.scalar(-1, type=result.type)) + offset_result = pc.add(result, start_offset) + result = pc.if_else(found, offset_result, -1) + return self._convert_int_result(result) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index d4aaef7eced83..861ec0c42c885 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2348,23 +2348,6 @@ def _str_fullmatch( pat = f"{pat}$" return self._str_match(pat, case, flags, na) - def _str_find(self, sub: str, start: int = 0, end: int | None = None): - if start != 0 and end is not None: - slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) - result = pc.find_substring(slices, sub) - not_found = pc.equal(result, -1) - start_offset = max(0, start) - offset_result = pc.add(result, start_offset) - result = pc.if_else(not_found, result, offset_result) - elif start == 0 and end is None: - slices = self._pa_array - result = pc.find_substring(slices, sub) - else: - raise NotImplementedError( - f"find not implemented with {sub=}, {start=}, {end=}" - ) - return type(self)(result) - def _str_join(self, sep: str): if pa.types.is_string(self._pa_array.type) or pa.types.is_large_string( self._pa_array.type diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index a3c2659beced0..563be79e98cbb 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -416,18 +416,14 @@ def _str_count(self, pat: str, flags: int = 0): return self._convert_int_result(result) def _str_find(self, sub: str, start: int = 0, end: int | None = None): - if start != 0 and end is not None: - slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) - result = pc.find_substring(slices, sub) - not_found = pc.equal(result, -1) - offset_result = pc.add(result, end - start) - result = 
pc.if_else(not_found, result, offset_result) - elif start == 0 and end is None: - slices = self._pa_array - result = pc.find_substring(slices, sub) - else: + if ( + pa_version_under13p0 + and not (start != 0 and end is not None) + and not (start == 0 and end is None) + ): + # GH#59562 return super()._str_find(sub, start, end) - return self._convert_int_result(result) + return ArrowStringArrayMixin._str_find(self, sub, start, end) def _str_get_dummies(self, sep: str = "|"): dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(sep) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 47d13b331843c..12f3eedb6b9f1 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1925,10 +1925,56 @@ def test_str_find_negative_start(): tm.assert_series_equal(result, expected) -def test_str_find_notimplemented(): +def test_str_find_no_end(): ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) - with pytest.raises(NotImplementedError, match="find not implemented"): - ser.str.find("ab", start=1) + result = ser.str.find("ab", start=1) + expected = pd.Series([-1, None], dtype="int64[pyarrow]") + tm.assert_series_equal(result, expected) + + +def test_str_find_negative_start_negative_end(): + # GH 56791 + ser = pd.Series(["abcdefg", None], dtype=ArrowDtype(pa.string())) + result = ser.str.find(sub="d", start=-6, end=-3) + expected = pd.Series([3, None], dtype=ArrowDtype(pa.int64())) + tm.assert_series_equal(result, expected) + + +def test_str_find_large_start(): + # GH 56791 + ser = pd.Series(["abcdefg", None], dtype=ArrowDtype(pa.string())) + result = ser.str.find(sub="d", start=16) + expected = pd.Series([-1, None], dtype=ArrowDtype(pa.int64())) + tm.assert_series_equal(result, expected) + + +@pytest.mark.skipif( + pa_version_under13p0, reason="https://github.com/apache/arrow/issues/36311" +) +@pytest.mark.parametrize("start", [-15, -3, 0, 1, 15, None]) 
+@pytest.mark.parametrize("end", [-15, -1, 0, 3, 15, None]) +@pytest.mark.parametrize("sub", ["", "az", "abce", "a", "caa"]) +def test_str_find_e2e(start, end, sub): + s = pd.Series( + ["abcaadef", "abc", "abcdeddefgj8292", "ab", "a", ""], + dtype=ArrowDtype(pa.string()), + ) + object_series = s.astype(pd.StringDtype(storage="python")) + result = s.str.find(sub, start, end) + expected = object_series.str.find(sub, start, end).astype(result.dtype) + tm.assert_series_equal(result, expected) + + arrow_str_series = s.astype(pd.StringDtype(storage="pyarrow")) + result2 = arrow_str_series.str.find(sub, start, end).astype(result.dtype) + tm.assert_series_equal(result2, expected) + + +def test_str_find_negative_start_negative_end_no_match(): + # GH 56791 + ser = pd.Series(["abcdefg", None], dtype=ArrowDtype(pa.string())) + result = ser.str.find(sub="d", start=-3, end=-6) + expected = pd.Series([-1, None], dtype=ArrowDtype(pa.int64())) + tm.assert_series_equal(result, expected) @pytest.mark.parametrize( From ccb90e30bf7bf0d7172864e55781b0946702ab1a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 6 Sep 2024 19:37:42 +0200 Subject: [PATCH 18/34] String dtype: implement _get_common_dtype (#59682) * String dtype: implement _get_common_dtype * add specific tests * try fix typing * try fix typing * suppress typing error * support numpy 2.0 string * fix typo --- pandas/core/arrays/string_.py | 32 ++++++++- pandas/tests/arrays/categorical/test_api.py | 3 - pandas/tests/arrays/string_/test_concat.py | 73 +++++++++++++++++++++ 3 files changed, 103 insertions(+), 5 deletions(-) create mode 100644 pandas/tests/arrays/string_/test_concat.py diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index c04ec13dbd81c..620d549204388 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -167,9 +167,9 @@ def __init__( # a consistent NaN value (and we can use `dtype.na_value is np.nan`) na_value = np.nan elif na_value is not 
libmissing.NA: - raise ValueError("'na_value' must be np.nan or pd.NA, got {na_value}") + raise ValueError(f"'na_value' must be np.nan or pd.NA, got {na_value}") - self.storage = storage + self.storage = cast(str, storage) self._na_value = na_value def __repr__(self) -> str: @@ -280,6 +280,34 @@ def construct_array_type( # type: ignore[override] else: return ArrowStringArrayNumpySemantics + def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: + storages = set() + na_values = set() + + for dtype in dtypes: + if isinstance(dtype, StringDtype): + storages.add(dtype.storage) + na_values.add(dtype.na_value) + elif isinstance(dtype, np.dtype) and dtype.kind in ("U", "T"): + continue + else: + return None + + if len(storages) == 2: + # if both python and pyarrow storage -> priority to pyarrow + storage = "pyarrow" + else: + storage = next(iter(storages)) # type: ignore[assignment] + + na_value: libmissing.NAType | float + if len(na_values) == 2: + # if both NaN and NA -> priority to NA + na_value = libmissing.NA + else: + na_value = next(iter(na_values)) + + return StringDtype(storage=storage, na_value=na_value) + def __from_arrow__( self, array: pyarrow.Array | pyarrow.ChunkedArray ) -> BaseStringArray: diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index 1d948b7495a43..a939ee5f6f53f 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import PY311 from pandas import ( @@ -158,7 +156,6 @@ def test_reorder_categories_raises(self, new_categories): with pytest.raises(ValueError, match=msg): cat.reorder_categories(new_categories) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_add_categories(self): cat = Categorical(["a", "b", "c", "a"], ordered=True) old = cat.copy() diff --git 
a/pandas/tests/arrays/string_/test_concat.py b/pandas/tests/arrays/string_/test_concat.py new file mode 100644 index 0000000000000..320d700b2b6c3 --- /dev/null +++ b/pandas/tests/arrays/string_/test_concat.py @@ -0,0 +1,73 @@ +import numpy as np +import pytest + +from pandas.compat import HAS_PYARROW + +from pandas.core.dtypes.cast import find_common_type + +import pandas as pd +import pandas._testing as tm +from pandas.util.version import Version + + +@pytest.mark.parametrize( + "to_concat_dtypes, result_dtype", + [ + # same types + ([("pyarrow", pd.NA), ("pyarrow", pd.NA)], ("pyarrow", pd.NA)), + ([("pyarrow", np.nan), ("pyarrow", np.nan)], ("pyarrow", np.nan)), + ([("python", pd.NA), ("python", pd.NA)], ("python", pd.NA)), + ([("python", np.nan), ("python", np.nan)], ("python", np.nan)), + # pyarrow preference + ([("pyarrow", pd.NA), ("python", pd.NA)], ("pyarrow", pd.NA)), + # NA preference + ([("python", pd.NA), ("python", np.nan)], ("python", pd.NA)), + ], +) +def test_concat_series(request, to_concat_dtypes, result_dtype): + if any(storage == "pyarrow" for storage, _ in to_concat_dtypes) and not HAS_PYARROW: + pytest.skip("Could not import 'pyarrow'") + + ser_list = [ + pd.Series(["a", "b", None], dtype=pd.StringDtype(storage, na_value)) + for storage, na_value in to_concat_dtypes + ] + + result = pd.concat(ser_list, ignore_index=True) + expected = pd.Series( + ["a", "b", None, "a", "b", None], dtype=pd.StringDtype(*result_dtype) + ) + tm.assert_series_equal(result, expected) + + # order doesn't matter for result + result = pd.concat(ser_list[::1], ignore_index=True) + tm.assert_series_equal(result, expected) + + +def test_concat_with_object(string_dtype_arguments): + # _get_common_dtype cannot inspect values, so object dtype with strings still + # results in object dtype + result = pd.concat( + [ + pd.Series(["a", "b", None], dtype=pd.StringDtype(*string_dtype_arguments)), + pd.Series(["a", "b", None], dtype=object), + ] + ) + assert result.dtype == 
np.dtype("object") + + +def test_concat_with_numpy(string_dtype_arguments): + # common type with a numpy string dtype always preserves the pandas string dtype + dtype = pd.StringDtype(*string_dtype_arguments) + assert find_common_type([dtype, np.dtype("U")]) == dtype + assert find_common_type([np.dtype("U"), dtype]) == dtype + assert find_common_type([dtype, np.dtype("U10")]) == dtype + assert find_common_type([np.dtype("U10"), dtype]) == dtype + + # with any other numpy dtype -> object + assert find_common_type([dtype, np.dtype("S")]) == np.dtype("object") + assert find_common_type([dtype, np.dtype("int64")]) == np.dtype("object") + + if Version(np.__version__) >= Version("2"): + assert find_common_type([dtype, np.dtypes.StringDType()]) == dtype + assert find_common_type([np.dtypes.StringDType(), dtype]) == dtype From 79dd74d36c9c1d8e645c793327587789cf4a7b0a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 9 Sep 2024 05:53:48 -0500 Subject: [PATCH 19/34] TST/BUG (string dtype): Fix and adjust indexes string tests (#59544) Co-authored-by: Joris Van den Bossche --- pandas/core/construction.py | 5 +++- pandas/core/indexes/base.py | 6 ++++- .../tests/indexes/base_class/test_setops.py | 6 ++--- pandas/tests/indexes/test_base.py | 11 ++------ pandas/tests/indexes/test_old_base.py | 26 ++++++++----------- 5 files changed, 24 insertions(+), 30 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 5bccca9cfbd47..584a1d417d198 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -609,7 +609,10 @@ def sanitize_array( dtype = StringDtype(na_value=np.nan) subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype) - if subarr is data and copy: + if ( + subarr is data + or (subarr.dtype == "str" and subarr.dtype.storage == "python") # type: ignore[union-attr] + ) and copy: subarr = subarr.copy() else: diff --git a/pandas/core/indexes/base.py 
b/pandas/core/indexes/base.py index 825316585c03c..a28c98ecc5cee 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -506,7 +506,8 @@ def __new__( elif is_ea_or_datetimelike_dtype(dtype): # non-EA dtype indexes have special casting logic, so we punt here - pass + if isinstance(data, (set, frozenset)): + data = list(data) elif is_ea_or_datetimelike_dtype(data_dtype): pass @@ -6995,6 +6996,9 @@ def insert(self, loc: int, item) -> Index: # We cannot keep the same dtype, so cast to the (often object) # minimal shared dtype before doing the insert. dtype = self._find_common_type_compat(item) + if dtype == self.dtype: + # EA's might run into recursion errors if loc is invalid + raise return self.astype(dtype).insert(loc, item) if arr.dtype != object or not isinstance( diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py index 2176aa52b17f4..a897e5aca058a 100644 --- a/pandas/tests/indexes/base_class/test_setops.py +++ b/pandas/tests/indexes/base_class/test_setops.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( Index, @@ -233,7 +231,6 @@ def test_tuple_union_bug(self, method, expected, sort): expected = Index(expected) tm.assert_index_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("first_list", [["b", "a"], []]) @pytest.mark.parametrize("second_list", [["a", "b"], []]) @pytest.mark.parametrize( @@ -243,6 +240,7 @@ def test_tuple_union_bug(self, method, expected, sort): def test_union_name_preservation( self, first_list, second_list, first_name, second_name, expected_name, sort ): + expected_dtype = object if not first_list or not second_list else "str" first = Index(first_list, name=first_name) second = Index(second_list, name=second_name) union = first.union(second, sort=sort) @@ -253,7 +251,7 @@ def 
test_union_name_preservation( expected = Index(sorted(vals), name=expected_name) tm.assert_index_equal(union, expected) else: - expected = Index(vals, name=expected_name) + expected = Index(vals, name=expected_name, dtype=expected_dtype) tm.assert_index_equal(union.sort_values(), expected.sort_values()) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index cf75f95d17b0a..813446440eded 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -76,9 +76,6 @@ def test_constructor_casting(self, index): tm.assert_contains_all(arr, new_index) tm.assert_index_equal(index, new_index) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" - ) def test_constructor_copy(self, using_infer_string): index = Index(list("abc"), name="name") arr = np.array(index) @@ -346,11 +343,6 @@ def test_constructor_empty_special(self, empty, klass): def test_view_with_args(self, index): index.view("i8") - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, - reason="TODO(infer_string)", - strict=False, - ) @pytest.mark.parametrize( "index", [ @@ -367,7 +359,8 @@ def test_view_with_args_object_array_raises(self, index): msg = "When changing to a larger dtype" with pytest.raises(ValueError, match=msg): index.view("i8") - elif index.dtype == "string": + elif index.dtype == "str" and not index.dtype.storage == "python": + # TODO(infer_string): Make the errors consistent with pytest.raises(NotImplementedError, match="i8"): index.view("i8") else: diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index c17d4f54c36c5..37aa01ea046ca 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -6,10 +6,7 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs import Timestamp -from pandas.compat import HAS_PYARROW from 
pandas.core.dtypes.common import ( is_integer_dtype, @@ -28,6 +25,7 @@ PeriodIndex, RangeIndex, Series, + StringDtype, TimedeltaIndex, isna, period_range, @@ -233,7 +231,6 @@ def test_logical_compat(self, simple_index): with pytest.raises(TypeError, match=msg): idx.any() - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_repr_roundtrip(self, simple_index): if isinstance(simple_index, IntervalIndex): pytest.skip(f"Not a valid repr for {type(simple_index).__name__}") @@ -250,11 +247,6 @@ def test_repr_max_seq_item_setting(self, simple_index): repr(idx) assert "..." not in str(idx) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, - reason="TODO(infer_string)", - strict=False, - ) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_ensure_copied_data(self, index): # Check the "copy" argument of each Index.__new__ is honoured @@ -302,7 +294,9 @@ def test_ensure_copied_data(self, index): tm.assert_numpy_array_equal( index._values._mask, result._values._mask, check_same="same" ) - elif index.dtype == "string[python]": + elif ( + isinstance(index.dtype, StringDtype) and index.dtype.storage == "python" + ): assert np.shares_memory(index._values._ndarray, result._values._ndarray) tm.assert_numpy_array_equal( index._values._ndarray, result._values._ndarray, check_same="same" @@ -432,11 +426,7 @@ def test_insert_base(self, index): result = trimmed.insert(0, index[0]) assert index[0:4].equals(result) - @pytest.mark.skipif( - using_string_dtype(), - reason="completely different behavior, tested elsewher", - ) - def test_insert_out_of_bounds(self, index): + def test_insert_out_of_bounds(self, index, using_infer_string): # TypeError/IndexError matches what np.insert raises in these cases if len(index) > 0: @@ -448,6 +438,12 @@ def test_insert_out_of_bounds(self, index): msg = "index (0|0.5) is out of bounds for axis 0 with size 0" else: msg = "slice indices must be integers or 
None or have an __index__ method" + + if using_infer_string and ( + index.dtype == "string" or index.dtype == "category" # noqa: PLR1714 + ): + msg = "loc must be an integer between" + with pytest.raises(err, match=msg): index.insert(0.5, "foo") From 743c682836dd028e2a654c3564d8b0362386206d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 9 Sep 2024 06:40:22 -0500 Subject: [PATCH 20/34] TST (string dtype): Adjust indexing string tests (#59541) Co-authored-by: Joris Van den Bossche --- pandas/core/arrays/string_.py | 4 ++ pandas/core/arrays/string_arrow.py | 2 +- pandas/tests/arrays/string_/test_string.py | 5 +-- pandas/tests/indexing/test_iloc.py | 31 +++++++------- pandas/tests/indexing/test_indexing.py | 18 ++++----- pandas/tests/indexing/test_loc.py | 47 ++++++++++++++-------- 6 files changed, 59 insertions(+), 48 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 620d549204388..43c46a4308f9e 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -713,6 +713,10 @@ def __setitem__(self, key, value) -> None: else: if not is_array_like(value): value = np.asarray(value, dtype=object) + else: + # cast categories and friends to arrays to see if values are + # compatible, compatibility with arrow backed strings + value = np.asarray(value) if len(value) and not lib.is_string_array(value, skipna=True): raise TypeError("Must provide strings.") diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 563be79e98cbb..5ed12e7352bd1 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -231,7 +231,7 @@ def _maybe_convert_setitem_value(self, value): value[isna(value)] = None for v in value: if not (v is None or isinstance(v, str)): - raise TypeError("Scalar must be NA or str") + raise TypeError("Must provide strings") return super()._maybe_convert_setitem_value(value) def isin(self, 
values: ArrayLike) -> npt.NDArray[np.bool_]: diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 4c53dabcdbf7a..d3a0897f88f61 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -101,10 +101,7 @@ def test_setitem_validates(cls, dtype): with pytest.raises(TypeError, match=msg): arr[0] = 10 - if dtype.storage == "python": - msg = "Must provide strings." - else: - msg = "Scalar must be NA or str" + msg = "Must provide strings" with pytest.raises(TypeError, match=msg): arr[:] = np.array([1, 2]) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 3fd9498e21a73..45f63bdf1ee32 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import IndexingError import pandas.util._test_decorators as td @@ -1218,22 +1216,25 @@ def test_iloc_getitem_int_single_ea_block_view(self): arr[2] = arr[-1] assert ser[0] == arr[-1] - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_iloc_setitem_multicolumn_to_datetime(self): + def test_iloc_setitem_multicolumn_to_datetime(self, using_infer_string): # GH#20511 df = DataFrame({"A": ["2022-01-01", "2022-01-02"], "B": ["2021", "2022"]}) - df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])}) - expected = DataFrame( - { - "A": [ - Timestamp("2021-01-01 00:00:00"), - Timestamp("2022-01-01 00:00:00"), - ], - "B": ["2021", "2022"], - } - ) - tm.assert_frame_equal(df, expected, check_dtype=False) + if using_infer_string: + with pytest.raises(TypeError, match="Invalid value"): + df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])}) + else: + df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])}) + expected = DataFrame( + { + "A": [ + Timestamp("2021-01-01 00:00:00"), + Timestamp("2022-01-01 
00:00:00"), + ], + "B": ["2021", "2022"], + } + ) + tm.assert_frame_equal(df, expected, check_dtype=False) class TestILocErrors: diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index e57598cfc2be1..0ff33ba88b16f 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import IndexingError from pandas.core.dtypes.common import ( @@ -563,12 +561,12 @@ def test_string_slice_empty(self): with pytest.raises(KeyError, match="^0$"): df.loc["2011", 0] - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_astype_assignment(self, using_infer_string): # GH4312 (iloc) df_orig = DataFrame( [["1", "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) + df_orig[list("ABCDG")] = df_orig[list("ABCDG")].astype(object) df = df_orig.copy() @@ -578,9 +576,9 @@ def test_astype_assignment(self, using_infer_string): expected = DataFrame( [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) - if not using_infer_string: - expected["A"] = expected["A"].astype(object) - expected["B"] = expected["B"].astype(object) + expected[list("CDG")] = expected[list("CDG")].astype(object) + expected["A"] = expected["A"].astype(object) + expected["B"] = expected["B"].astype(object) tm.assert_frame_equal(df, expected) # GH5702 (loc) @@ -589,18 +587,16 @@ def test_astype_assignment(self, using_infer_string): expected = DataFrame( [[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) - if not using_infer_string: - expected["A"] = expected["A"].astype(object) + expected[list("ABCDG")] = expected[list("ABCDG")].astype(object) tm.assert_frame_equal(df, expected) df = df_orig.copy() + df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64) expected = DataFrame( [["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) - if not using_infer_string: - 
expected["B"] = expected["B"].astype(object) - expected["C"] = expected["C"].astype(object) + expected[list("ABCDG")] = expected[list("ABCDG")].astype(object) tm.assert_frame_equal(df, expected) def test_astype_assignment_full_replacements(self): diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index d61b2ea642439..ad72be02f81b1 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1,5 +1,6 @@ """ test label based indexing with loc """ from collections import namedtuple +import contextlib from datetime import ( date, datetime, @@ -648,8 +649,9 @@ def test_loc_setitem_consistency_empty(self): expected["x"] = expected["x"].astype(np.int64) tm.assert_frame_equal(df, expected) + # incompatible dtype warning @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_loc_setitem_consistency_slice_column_len(self): + def test_loc_setitem_consistency_slice_column_len(self, using_infer_string): # .loc[:,column] setting with slice == len of the column # GH10408 levels = [ @@ -673,13 +675,24 @@ def test_loc_setitem_consistency_slice_column_len(self): ] df = DataFrame(values, index=mi, columns=cols) - df.loc[:, ("Respondent", "StartDate")] = to_datetime( - df.loc[:, ("Respondent", "StartDate")] - ) - df.loc[:, ("Respondent", "EndDate")] = to_datetime( - df.loc[:, ("Respondent", "EndDate")] - ) - df = df.infer_objects(copy=False) + ctx = contextlib.nullcontext() + if using_infer_string: + ctx = pytest.raises(TypeError, match="Invalid value") + + with ctx: + df.loc[:, ("Respondent", "StartDate")] = to_datetime( + df.loc[:, ("Respondent", "StartDate")] + ) + with ctx: + df.loc[:, ("Respondent", "EndDate")] = to_datetime( + df.loc[:, ("Respondent", "EndDate")] + ) + + if using_infer_string: + # infer-objects won't infer stuff anymore + return + + df = df.infer_objects() # Adding a new key df.loc[:, ("Respondent", "Duration")] = ( @@ -1269,20 +1282,23 @@ def 
test_loc_reverse_assignment(self): tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set int into string") - def test_loc_setitem_str_to_small_float_conversion_type(self): + def test_loc_setitem_str_to_small_float_conversion_type(self, using_infer_string): # GH#20388 col_data = [str(np.random.default_rng(2).random() * 1e-12) for _ in range(5)] result = DataFrame(col_data, columns=["A"]) - expected = DataFrame(col_data, columns=["A"], dtype=object) + expected = DataFrame(col_data, columns=["A"]) tm.assert_frame_equal(result, expected) # assigning with loc/iloc attempts to set the values inplace, which # in this case is successful - result.loc[result.index, "A"] = [float(x) for x in col_data] - expected = DataFrame(col_data, columns=["A"], dtype=float).astype(object) - tm.assert_frame_equal(result, expected) + if using_infer_string: + with pytest.raises(TypeError, match="Must provide strings"): + result.loc[result.index, "A"] = [float(x) for x in col_data] + else: + result.loc[result.index, "A"] = [float(x) for x in col_data] + expected = DataFrame(col_data, columns=["A"], dtype=float).astype(object) + tm.assert_frame_equal(result, expected) # assigning the entire column using __setitem__ swaps in the new array # GH#??? 
@@ -1458,9 +1474,6 @@ def test_loc_setitem_categorical_values_partial_column_slice(self): df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"]) tm.assert_frame_equal(df, exp) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" - ) def test_loc_setitem_single_row_categorical(self, using_infer_string): # GH#25495 df = DataFrame({"Alpha": ["a"], "Numeric": [0]}) From bf47ce61881d858f725597562744bf2800513da7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 Sep 2024 19:15:02 +0200 Subject: [PATCH 21/34] TST (string dtype): adjust pandas/tests/reshape tests (#59762) --- pandas/tests/reshape/concat/test_concat.py | 2 ++ pandas/tests/reshape/merge/test_merge_asof.py | 10 ++----- pandas/tests/reshape/test_get_dummies.py | 10 ++----- pandas/tests/reshape/test_melt.py | 25 ++++++----------- pandas/tests/reshape/test_pivot.py | 28 ++++++++++++------- 5 files changed, 34 insertions(+), 41 deletions(-) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 2a52d3060e4b9..77c45cf36894b 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -46,6 +46,7 @@ def test_append_concat(self): assert isinstance(result.index, PeriodIndex) assert result.index[0] == s1.index[0] + # test is not written to work with string dtype (checks .base) @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_concat_copy(self, using_array_manager, using_copy_on_write): df = DataFrame(np.random.default_rng(2).standard_normal((4, 3))) @@ -80,6 +81,7 @@ def test_concat_copy(self, using_array_manager, using_copy_on_write): assert arr is df3._mgr.arrays[0] else: assert arr.base is not None + assert arr.base is not None # Float block was consolidated. 
df4 = DataFrame(np.random.default_rng(2).standard_normal((4, 1))) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 11e29f4e10dc4..77a3d64415ace 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -4,8 +4,6 @@ import pytest import pytz -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td import pandas as pd @@ -3083,12 +3081,8 @@ def test_on_float_by_int(self): tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_merge_datatype_error_raises(self, using_infer_string): - if using_infer_string: - msg = "incompatible merge keys" - else: - msg = r"Incompatible merge dtype, .*, both sides must have numeric dtype" + def test_merge_datatype_error_raises(self): + msg = r"Incompatible merge dtype, .*, both sides must have numeric dtype" left = pd.DataFrame({"left_val": [1, 5, 10], "a": ["a", "b", "c"]}) right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7], "a": [1, 2, 3, 6, 7]}) diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index 2c17b7f6a5a47..324d2a6cfd419 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_integer_dtype @@ -216,11 +214,10 @@ def test_dataframe_dummies_all_obj(self, df, sparse): tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_dataframe_dummies_string_dtype(self, df, using_infer_string): + def test_dataframe_dummies_string_dtype(self, df, any_string_dtype): # GH44965 df = df[["A", "B"]] - df = df.astype({"A": "object", "B": "string"}) + df = df.astype({"A": "str", "B": 
any_string_dtype}) result = get_dummies(df) expected = DataFrame( { @@ -231,8 +228,7 @@ def test_dataframe_dummies_string_dtype(self, df, using_infer_string): }, dtype=bool, ) - if not using_infer_string: - # infer_string returns numpy bools + if any_string_dtype == "string" and any_string_dtype.na_value is pd.NA: expected[["B_b", "B_c"]] = expected[["B_b", "B_c"]].astype("boolean") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index cbe2c9b931ee3..944e61896a182 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -21,7 +19,7 @@ def df(): res = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) res["id1"] = (res["A"] > 0).astype(np.int64) @@ -83,7 +81,6 @@ def test_default_col_names(self, df): result2 = df.melt(id_vars=["id1", "id2"]) assert result2.columns.tolist() == ["id1", "id2", "variable", "value"] - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_value_vars(self, df): result3 = df.melt(id_vars=["id1", "id2"], value_vars="A") assert len(result3) == 10 @@ -100,7 +97,6 @@ def test_value_vars(self, df): ) tm.assert_frame_equal(result4, expected4) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("type_", (tuple, list, np.array)) def test_value_vars_types(self, type_, df): # GH 15348 @@ -181,7 +177,6 @@ def test_tuple_vars_fail_with_multiindex(self, id_vars, value_vars, df1): with pytest.raises(ValueError, match=msg): df1.melt(id_vars=id_vars, value_vars=value_vars) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_custom_var_name(self, df, var_name): 
result5 = df.melt(var_name=var_name) assert result5.columns.tolist() == ["var", "value"] @@ -209,7 +204,6 @@ def test_custom_var_name(self, df, var_name): ) tm.assert_frame_equal(result9, expected9) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_custom_value_name(self, df, value_name): result10 = df.melt(value_name=value_name) assert result10.columns.tolist() == ["variable", "val"] @@ -239,7 +233,6 @@ def test_custom_value_name(self, df, value_name): ) tm.assert_frame_equal(result14, expected14) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_custom_var_and_value_name(self, df, value_name, var_name): result15 = df.melt(var_name=var_name, value_name=value_name) assert result15.columns.tolist() == ["var", "val"] @@ -364,7 +357,6 @@ def test_melt_missing_columns_raises(self): with pytest.raises(KeyError, match=msg): multi.melt(["A"], ["F"], col_level=0) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_melt_mixed_int_str_id_vars(self): # GH 29718 df = DataFrame({0: ["foo"], "a": ["bar"], "b": [1], "d": [2]}) @@ -372,6 +364,8 @@ def test_melt_mixed_int_str_id_vars(self): expected = DataFrame( {0: ["foo"] * 2, "a": ["bar"] * 2, "variable": list("bd"), "value": [1, 2]} ) + # the df's columns are mixed type and thus object -> preserves object dtype + expected["variable"] = expected["variable"].astype(object) tm.assert_frame_equal(result, expected) def test_melt_mixed_int_str_value_vars(self): @@ -1205,12 +1199,10 @@ def test_raise_of_column_name_value(self): ): df.melt(id_vars="value", value_name="value") - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) - @pytest.mark.parametrize("dtype", ["O", "string"]) - def test_missing_stubname(self, dtype): + def test_missing_stubname(self, any_string_dtype): # GH46044 df = DataFrame({"id": ["1", "2"], "a-1": [100, 200], "a-2": [300, 400]}) - df = df.astype({"id": dtype}) + df = df.astype({"id": 
any_string_dtype}) result = wide_to_long( df, stubnames=["a", "b"], @@ -1226,12 +1218,13 @@ def test_missing_stubname(self, dtype): {"a": [100, 200, 300, 400], "b": [np.nan] * 4}, index=index, ) - new_level = expected.index.levels[0].astype(dtype) + new_level = expected.index.levels[0].astype(any_string_dtype) + if any_string_dtype == "object": + new_level = expected.index.levels[0].astype("str") expected.index = expected.index.set_levels(new_level, level=0) tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_wide_to_long_pyarrow_string_columns(): # GH 57066 pytest.importorskip("pyarrow") @@ -1250,7 +1243,7 @@ def test_wide_to_long_pyarrow_string_columns(): ) expected = DataFrame( [[1, 1], [1, 1], [1, 2]], - columns=Index(["D", "R"], dtype=object), + columns=Index(["D", "R"]), index=pd.MultiIndex.from_arrays( [ [1, 1, 1], diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 9aa13d59a586b..d0858a0ea5558 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1081,7 +1081,6 @@ def test_margins_dtype_len(self, data): tm.assert_frame_equal(expected, result) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("cols", [(1, 2), ("a", "b"), (1, "b"), ("a", 1)]) def test_pivot_table_multiindex_only(self, cols): # GH 17038 @@ -1091,7 +1090,7 @@ def test_pivot_table_multiindex_only(self, cols): expected = DataFrame( [[4.0, 5.0, 6.0]], columns=MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)], names=cols), - index=Index(["v"], dtype=object), + index=Index(["v"], dtype="str" if cols == ("a", "b") else "object"), ) tm.assert_frame_equal(result, expected) @@ -2525,13 +2524,16 @@ def test_pivot_empty(self): expected = DataFrame(index=[], columns=[]) tm.assert_frame_equal(result, expected, check_names=False) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", 
strict=False) - @pytest.mark.parametrize("dtype", [object, "string"]) - def test_pivot_integer_bug(self, dtype): - df = DataFrame(data=[("A", "1", "A1"), ("B", "2", "B2")], dtype=dtype) + def test_pivot_integer_bug(self, any_string_dtype): + df = DataFrame( + data=[("A", "1", "A1"), ("B", "2", "B2")], dtype=any_string_dtype + ) result = df.pivot(index=1, columns=0, values=2) - tm.assert_index_equal(result.columns, Index(["A", "B"], name=0, dtype=dtype)) + expected_columns = Index(["A", "B"], name=0, dtype=any_string_dtype) + if any_string_dtype == "object": + expected_columns = expected_columns.astype("str") + tm.assert_index_equal(result.columns, expected_columns) def test_pivot_index_none(self): # GH#3962 @@ -2613,7 +2615,9 @@ def test_pivot_columns_not_given(self): with pytest.raises(TypeError, match="missing 1 required keyword-only argument"): df.pivot() # pylint: disable=missing-kwoa - @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN") + @pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" + ) def test_pivot_columns_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2629,7 +2633,9 @@ def test_pivot_columns_is_none(self): expected = DataFrame({1: 3}, index=Index([2], name="b")) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN") + @pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" + ) def test_pivot_index_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2643,7 +2649,9 @@ def test_pivot_index_is_none(self): expected = DataFrame(3, index=[1], columns=Index([2], name="b")) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN") + @pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" + ) def test_pivot_values_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 
2, "c": 3}) From 74c6fac99ee4f6d582f32ee35a50fc6b903e436a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 Sep 2024 22:21:36 +0200 Subject: [PATCH 22/34] BUG (string dtype): fix inplace mutation with copy=False in ensure_string_array (#59756) * BUG (string dtype): fix inplace mutation with copy=False in ensure_string_array * update --- pandas/_libs/lib.pyx | 19 ++++++++++++++----- pandas/tests/copy_view/test_astype.py | 22 +++++++++++++++++++++- pandas/tests/libs/test_lib.py | 14 ++++++++++++++ 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 5d8a04664b0e4..d93099cd79d1b 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -736,7 +736,9 @@ cpdef ndarray[object] ensure_string_array( convert_na_value : bool, default True If False, existing na values will be used unchanged in the new array. copy : bool, default True - Whether to ensure that a new array is returned. + Whether to ensure that a new array is returned. When True, a new array + is always returned. When False, a new array is only returned when needed + to avoid mutating the input array. skipna : bool, default True Whether or not to coerce nulls to their stringified form (e.g. if False, NaN becomes 'nan'). @@ -765,10 +767,17 @@ cpdef ndarray[object] ensure_string_array( result = np.asarray(arr, dtype="object") - if copy and (result is arr or np.shares_memory(arr, result)): - # GH#54654 - result = result.copy() - elif not copy and result is arr: + if result is arr or np.may_share_memory(arr, result): + # if np.asarray(..) 
did not make a copy of the input arr, we still need + # to do that to avoid mutating the input array + # GH#54654: share_memory check is needed for rare cases where np.asarray + # returns a new object without making a copy of the actual data + if copy: + result = result.copy() + else: + already_copied = False + elif not copy and not result.flags.writeable: + # Weird edge case where result is a view already_copied = False if issubclass(arr.dtype.type, np.str_): diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index fb82329d5b50d..e0e3f6dc058a4 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -135,7 +135,8 @@ def test_astype_string_and_object_update_original( tm.assert_frame_equal(df2, df_orig) -def test_astype_string_copy_on_pickle_roundrip(): +def test_astype_str_copy_on_pickle_roundrip(): + # TODO(infer_string) this test can be removed after 3.0 (once str is the default) # https://github.com/pandas-dev/pandas/issues/54654 # ensure_string_array may alter array inplace base = Series(np.array([(1, 2), None, 1], dtype="object")) @@ -144,6 +145,25 @@ def test_astype_string_copy_on_pickle_roundrip(): tm.assert_series_equal(base, base_copy) +def test_astype_string_copy_on_pickle_roundrip(any_string_dtype): + # https://github.com/pandas-dev/pandas/issues/54654 + # ensure_string_array may alter array inplace + base = Series(np.array([(1, 2), None, 1], dtype="object")) + base_copy = pickle.loads(pickle.dumps(base)) + base_copy.astype(any_string_dtype) + tm.assert_series_equal(base, base_copy) + + +def test_astype_string_read_only_on_pickle_roundrip(any_string_dtype): + # https://github.com/pandas-dev/pandas/issues/54654 + # ensure_string_array may alter read-only array inplace + base = Series(np.array([(1, 2), None, 1], dtype="object")) + base_copy = pickle.loads(pickle.dumps(base)) + base_copy._values.flags.writeable = False + base_copy.astype(any_string_dtype) + 
tm.assert_series_equal(base, base_copy) + + def test_astype_dict_dtypes(using_copy_on_write): df = DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], "c": Series([1.5, 1.5, 1.5], dtype="float64")} diff --git a/pandas/tests/libs/test_lib.py b/pandas/tests/libs/test_lib.py index 8583d8bcc052c..17dae1879f3b8 100644 --- a/pandas/tests/libs/test_lib.py +++ b/pandas/tests/libs/test_lib.py @@ -1,3 +1,5 @@ +import pickle + import numpy as np import pytest @@ -283,3 +285,15 @@ def test_no_default_pickle(): # GH#40397 obj = tm.round_trip_pickle(lib.no_default) assert obj is lib.no_default + + +def test_ensure_string_array_copy(): + # ensure the original array is not modified in case of copy=False with + # pickle-roundtripped object dtype array + # https://github.com/pandas-dev/pandas/issues/54654 + arr = np.array(["a", None], dtype=object) + arr = pickle.loads(pickle.dumps(arr)) + result = lib.ensure_string_array(arr, copy=False) + assert not np.shares_memory(arr, result) + assert arr[1] is None + assert result[1] is np.nan From ca24b425cc9da4053f1abcce945cda6f390e7176 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 Sep 2024 22:34:28 +0200 Subject: [PATCH 23/34] TST (string dtype): remove usage of 'string[pyarrow_numpy]' alias (#59758) --- pandas/conftest.py | 28 +++++++++++++++++++ pandas/tests/apply/test_numba.py | 6 ++-- .../tests/arrays/string_/test_string_arrow.py | 5 ++-- pandas/tests/base/test_misc.py | 4 +-- pandas/tests/frame/indexing/test_indexing.py | 10 ++----- pandas/tests/frame/methods/test_rank.py | 14 +++++----- pandas/tests/frame/test_constructors.py | 7 ++--- pandas/tests/groupby/methods/test_size.py | 13 ++------- .../groupby/methods/test_value_counts.py | 14 ++-------- pandas/tests/groupby/test_groupby.py | 11 ++------ pandas/tests/groupby/test_reductions.py | 5 ++-- .../indexes/base_class/test_constructors.py | 4 +-- .../tests/indexes/base_class/test_reshape.py | 7 ++--- pandas/tests/indexes/object/test_indexing.py | 23 ++++----------- 
pandas/tests/indexes/test_base.py | 5 ++-- pandas/tests/indexes/test_old_base.py | 5 +++- pandas/tests/interchange/test_impl.py | 8 ++++-- pandas/tests/io/json/test_pandas.py | 8 +++--- .../io/parser/dtypes/test_dtypes_basic.py | 11 +++----- pandas/tests/io/pytables/test_read.py | 5 ++-- pandas/tests/io/test_feather.py | 4 ++- pandas/tests/io/test_orc.py | 4 +-- pandas/tests/io/test_parquet.py | 8 +++--- pandas/tests/io/test_sql.py | 3 +- pandas/tests/reshape/test_get_dummies.py | 22 +++++++-------- pandas/tests/reshape/test_melt.py | 8 +++--- pandas/tests/series/test_logical_ops.py | 3 +- pandas/tests/strings/test_find_replace.py | 2 +- pandas/tests/util/test_shares_memory.py | 6 ++-- 29 files changed, 119 insertions(+), 134 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 433ea7275223d..f957289ea52e8 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1228,6 +1228,34 @@ def string_dtype(request): return request.param +@pytest.fixture( + params=[ + ("python", pd.NA), + pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), + pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + ("python", np.nan), + ], + ids=[ + "string=string[python]", + "string=string[pyarrow]", + "string=str[pyarrow]", + "string=str[python]", + ], +) +def string_dtype_no_object(request): + """ + Parametrized fixture for string dtypes. 
+ * 'string[python]' (NA variant) + * 'string[pyarrow]' (NA variant) + * 'str' (NaN variant, with pyarrow) + * 'str' (NaN variant, without pyarrow) + """ + # need to instantiate the StringDtype here instead of in the params + # to avoid importing pyarrow during test collection + storage, na_value = request.param + return pd.StringDtype(storage, na_value) + + @pytest.fixture( params=[ "string[python]", diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index 6bbe5100e8826..83b655f89e247 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -5,6 +5,7 @@ import pandas.util._test_decorators as td +import pandas as pd from pandas import ( DataFrame, Index, @@ -29,11 +30,10 @@ def test_numba_vs_python_noop(float_frame, apply_axis): def test_numba_vs_python_string_index(): # GH#56189 - pytest.importorskip("pyarrow") df = DataFrame( 1, - index=Index(["a", "b"], dtype="string[pyarrow_numpy]"), - columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"), + index=Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), + columns=Index(["x", "y"], dtype=pd.StringDtype(na_value=np.nan)), ) func = lambda x: x result = df.apply(func, engine="numba", axis=0) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index e6957feecf4b5..2f3840e92b62a 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -241,10 +241,11 @@ def test_setitem_invalid_indexer_raises(): arr[[0, 1]] = ["foo", "bar", "baz"] -@pytest.mark.parametrize("dtype", ["string[pyarrow]", "string[pyarrow_numpy]"]) -def test_pickle_roundtrip(dtype): +@pytest.mark.parametrize("na_value", [pd.NA, np.nan]) +def test_pickle_roundtrip(na_value): # GH 42600 pytest.importorskip("pyarrow") + dtype = StringDtype("pyarrow", na_value=na_value) expected = pd.Series(range(10), dtype=dtype) expected_sliced = expected.head(2) full_pickled = 
pickle.dumps(expected) diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index 3e0d8b1afedc0..b42e01c76335c 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -180,9 +180,7 @@ def test_access_by_position(index_flat): assert index[-1] == index[size - 1] msg = f"index {size} is out of bounds for axis 0 with size {size}" - if is_dtype_equal(index.dtype, "string[pyarrow]") or is_dtype_equal( - index.dtype, "string[pyarrow_numpy]" - ): + if isinstance(index.dtype, pd.StringDtype) and index.dtype.storage == "pyarrow": msg = "index out of bounds" with pytest.raises(IndexError, match=msg): index[size] diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 7a7586961deca..04dba325f060f 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1955,13 +1955,11 @@ def test_adding_new_conditional_column() -> None: ("dtype", "infer_string"), [ (object, False), - ("string[pyarrow_numpy]", True), + (pd.StringDtype(na_value=np.nan), True), ], ) def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None: # https://github.com/pandas-dev/pandas/issues/56204 - pytest.importorskip("pyarrow") - df = DataFrame({"a": [1, 2], "b": [3, 4]}) with pd.option_context("future.infer_string", infer_string): df.loc[df["a"] == 1, "c"] = "1" @@ -1971,16 +1969,14 @@ def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None: tm.assert_frame_equal(df, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_add_new_column_infer_string(): # GH#55366 - pytest.importorskip("pyarrow") df = DataFrame({"x": [1]}) with pd.option_context("future.infer_string", True): df.loc[df["x"] == 1, "y"] = "1" expected = DataFrame( - {"x": [1], "y": Series(["1"], dtype="string[pyarrow_numpy]")}, - columns=Index(["x", "y"], dtype=object), + {"x": [1], "y": Series(["1"], 
dtype=pd.StringDtype(na_value=np.nan))}, + columns=Index(["x", "y"], dtype="str"), ) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index edba971408d04..82722eeb1af72 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -14,6 +14,7 @@ ) from pandas.compat import HAS_PYARROW +import pandas as pd from pandas import ( DataFrame, Index, @@ -509,14 +510,13 @@ def test_rank_mixed_axis_zero(self, data, expected): result = df.rank(numeric_only=True) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "dtype, exp_dtype", - [("string[pyarrow]", "Int64"), ("string[pyarrow_numpy]", "float64")], - ) - def test_rank_string_dtype(self, dtype, exp_dtype): + def test_rank_string_dtype(self, string_dtype_no_object): # GH#55362 - pytest.importorskip("pyarrow") - obj = Series(["foo", "foo", None, "foo"], dtype=dtype) + obj = Series(["foo", "foo", None, "foo"], dtype=string_dtype_no_object) result = obj.rank(method="first") + exp_dtype = "Int64" if string_dtype_no_object.na_value is pd.NA else "float64" + if string_dtype_no_object.storage == "python": + # TODO nullable string[python] should also return nullable Int64 + exp_dtype = "float64" expected = Series([1, 2, None, 3], dtype=exp_dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 86d9dc0c7fbdc..f70d36d110625 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2721,8 +2721,7 @@ def test_construct_with_strings_and_none(self): def test_frame_string_inference(self): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" + dtype = pd.StringDtype(na_value=np.nan) expected = DataFrame( {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) ) @@ -2756,8 +2755,7 @@ def test_frame_string_inference(self): def 
test_frame_string_inference_array_string_dtype(self): # GH#54496 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" + dtype = pd.StringDtype(na_value=np.nan) expected = DataFrame( {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) ) @@ -2781,7 +2779,6 @@ def test_frame_string_inference_array_string_dtype(self): def test_frame_string_inference_block_dim(self): # GH#55363 - pytest.importorskip("pyarrow") with pd.option_context("future.infer_string", True): df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]])) assert df._mgr.blocks[0].ndim == 2 diff --git a/pandas/tests/groupby/methods/test_size.py b/pandas/tests/groupby/methods/test_size.py index 5b4c08fc24411..fb834ee2a8799 100644 --- a/pandas/tests/groupby/methods/test_size.py +++ b/pandas/tests/groupby/methods/test_size.py @@ -3,8 +3,6 @@ from pandas._config import using_string_dtype -import pandas.util._test_decorators as td - from pandas.core.dtypes.common import is_integer_dtype from pandas import ( @@ -111,16 +109,9 @@ def test_size_series_masked_type_returns_Int64(dtype): @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) -@pytest.mark.parametrize( - "dtype", - [ - object, - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - ], -) -def test_size_strings(dtype): +def test_size_strings(any_string_dtype): # GH#55627 + dtype = any_string_dtype df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype) result = df.groupby("a")["b"].size() exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64" diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index dc986d046ca41..d8c6c7c3fe50c 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -8,8 +8,6 @@ import numpy as np import pytest -import 
pandas.util._test_decorators as td - from pandas import ( Categorical, CategoricalIndex, @@ -389,14 +387,6 @@ def test_against_frame_and_seriesgroupby( tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "dtype", - [ - object, - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - ], -) @pytest.mark.parametrize("normalize", [True, False]) @pytest.mark.parametrize( "sort, ascending, expected_rows, expected_count, expected_group_size", @@ -414,9 +404,10 @@ def test_compound( expected_rows, expected_count, expected_group_size, - dtype, + any_string_dtype, using_infer_string, ): + dtype = any_string_dtype education_df = education_df.astype(dtype) education_df.columns = education_df.columns.astype(dtype) # Multiple groupby keys and as_index=False @@ -433,6 +424,7 @@ def test_compound( expected["proportion"] = expected_count expected["proportion"] /= expected_group_size if dtype == "string[pyarrow]": + # TODO(nullable) also string[python] should return nullable dtypes expected["proportion"] = expected["proportion"].convert_dtypes() else: expected["count"] = expected_count diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 015a9db32883b..586ef8a126536 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2832,20 +2832,13 @@ def test_rolling_wrong_param_min_period(): test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum() -@pytest.mark.parametrize( - "dtype", - [ - object, - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - ], -) -def test_by_column_values_with_same_starting_value(dtype): +def test_by_column_values_with_same_starting_value(any_string_dtype): # GH29635 df = DataFrame( { "Name": ["Thomas", "Thomas", "Thomas John"], "Credit": [1200, 1300, 900], - "Mood": Series(["sad", "happy", "happy"], dtype=dtype), + "Mood": Series(["sad", "happy", 
"happy"], dtype=any_string_dtype), } ) aggregate_details = {"Mood": Series.mode, "Credit": "sum"} diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index f67051de6e8c7..8e1bbcb43e3f3 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -702,10 +702,9 @@ def test_groupby_min_max_categorical(func): @pytest.mark.parametrize("func", ["min", "max"]) -def test_min_empty_string_dtype(func): +def test_min_empty_string_dtype(func, string_dtype_no_object): # GH#55619 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" + dtype = string_dtype_no_object df = DataFrame({"a": ["a"], "b": "a", "c": "a"}, dtype=dtype).iloc[:0] result = getattr(df.groupby("a"), func)() expected = DataFrame( diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py index 338509dd239e6..dcf0165ead6c0 100644 --- a/pandas/tests/indexes/base_class/test_constructors.py +++ b/pandas/tests/indexes/base_class/test_constructors.py @@ -47,9 +47,7 @@ def test_construct_empty_tuples(self, tuple_list): def test_index_string_inference(self): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" - expected = Index(["a", "b"], dtype=dtype) + expected = Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)) with pd.option_context("future.infer_string", True): ser = Index(["a", "b"]) tm.assert_index_equal(ser, expected) diff --git a/pandas/tests/indexes/base_class/test_reshape.py b/pandas/tests/indexes/base_class/test_reshape.py index 6a544e448ebe1..b1a6c30b52f68 100644 --- a/pandas/tests/indexes/base_class/test_reshape.py +++ b/pandas/tests/indexes/base_class/test_reshape.py @@ -59,12 +59,11 @@ def test_insert_datetime_into_object(self, loc, val): tm.assert_index_equal(result, expected) assert type(expected[2]) is type(val) - def test_insert_none_into_string_numpy(self): + def test_insert_none_into_string_numpy(self, 
string_dtype_no_object): # GH#55365 - pytest.importorskip("pyarrow") - index = Index(["a", "b", "c"], dtype="string[pyarrow_numpy]") + index = Index(["a", "b", "c"], dtype=string_dtype_no_object) result = index.insert(-1, None) - expected = Index(["a", "b", None, "c"], dtype="string[pyarrow_numpy]") + expected = Index(["a", "b", None, "c"], dtype=string_dtype_no_object) tm.assert_index_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index 322e6677fe05d..57e5c5e3b6abb 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -7,7 +7,6 @@ NA, is_matching_na, ) -import pandas.util._test_decorators as td import pandas as pd from pandas import Index @@ -159,14 +158,6 @@ def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2): class TestSliceLocs: - # TODO(infer_string) parametrize over multiple string dtypes - @pytest.mark.parametrize( - "dtype", - [ - "object", - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - ], - ) @pytest.mark.parametrize( "in_slice,expected", [ @@ -190,24 +181,22 @@ class TestSliceLocs: (pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc] ], ) - def test_slice_locs_negative_step(self, in_slice, expected, dtype): - index = Index(list("bcdxy"), dtype=dtype) + def test_slice_locs_negative_step(self, in_slice, expected, any_string_dtype): + index = Index(list("bcdxy"), dtype=any_string_dtype) s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step) result = index[s_start : s_stop : in_slice.step] - expected = Index(list(expected), dtype=dtype) + expected = Index(list(expected), dtype=any_string_dtype) tm.assert_index_equal(result, expected) - # TODO(infer_string) parametrize over multiple string dtypes - @td.skip_if_no("pyarrow") - def test_slice_locs_negative_step_oob(self): - index = Index(list("bcdxy"), 
dtype="string[pyarrow_numpy]") + def test_slice_locs_negative_step_oob(self, any_string_dtype): + index = Index(list("bcdxy"), dtype=any_string_dtype) result = index[-10:5:1] tm.assert_index_equal(result, index) result = index[4:-10:-1] - expected = Index(list("yxdcb"), dtype="string[pyarrow_numpy]") + expected = Index(list("yxdcb"), dtype=any_string_dtype) tm.assert_index_equal(result, expected) def test_slice_locs_dup(self): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 813446440eded..3bcc62445f0ac 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -971,10 +971,9 @@ def test_isin_empty(self, empty): result = index.isin(empty) tm.assert_numpy_array_equal(expected, result) - @td.skip_if_no("pyarrow") - def test_isin_arrow_string_null(self): + def test_isin_string_null(self, string_dtype_no_object): # GH#55821 - index = Index(["a", "b"], dtype="string[pyarrow_numpy]") + index = Index(["a", "b"], dtype=string_dtype_no_object) result = index.isin([None]) expected = np.array([False, False]) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 37aa01ea046ca..176bf893cafa8 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -301,7 +301,10 @@ def test_ensure_copied_data(self, index): tm.assert_numpy_array_equal( index._values._ndarray, result._values._ndarray, check_same="same" ) - elif index.dtype in ("string[pyarrow]", "string[pyarrow_numpy]"): + elif ( + isinstance(index.dtype, StringDtype) + and index.dtype.storage == "pyarrow" + ): assert tm.shares_memory(result._values, index._values) else: raise NotImplementedError(index.dtype) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index d1a15dc93f702..b3af8def191ec 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ 
-476,7 +476,7 @@ def test_non_str_names_w_duplicates(): ([1.0, 2.25, None], "Float32[pyarrow]", "float32"), ([True, False, None], "boolean", "bool"), ([True, False, None], "boolean[pyarrow]", "bool"), - (["much ado", "about", None], "string[pyarrow_numpy]", "large_string"), + (["much ado", "about", None], pd.StringDtype(na_value=np.nan), "large_string"), (["much ado", "about", None], "string[pyarrow]", "large_string"), ( [datetime(2020, 1, 1), datetime(2020, 1, 2), None], @@ -539,7 +539,11 @@ def test_pandas_nullable_with_missing_values( ([1.0, 2.25, 5.0], "Float32[pyarrow]", "float32"), ([True, False, False], "boolean", "bool"), ([True, False, False], "boolean[pyarrow]", "bool"), - (["much ado", "about", "nothing"], "string[pyarrow_numpy]", "large_string"), + ( + ["much ado", "about", "nothing"], + pd.StringDtype(na_value=np.nan), + "large_string", + ), (["much ado", "about", "nothing"], "string[pyarrow]", "large_string"), ( [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 3)], diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index de40441fe25dd..a8608434be5ee 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2139,18 +2139,18 @@ def test_pyarrow_engine_lines_false(): def test_json_roundtrip_string_inference(orient): - pytest.importorskip("pyarrow") df = DataFrame( [["a", "b"], ["c", "d"]], index=["row 1", "row 2"], columns=["col 1", "col 2"] ) out = df.to_json() with pd.option_context("future.infer_string", True): result = read_json(StringIO(out)) + dtype = pd.StringDtype(na_value=np.nan) expected = DataFrame( [["a", "b"], ["c", "d"]], - dtype="string[pyarrow_numpy]", - index=Index(["row 1", "row 2"], dtype="string[pyarrow_numpy]"), - columns=Index(["col 1", "col 2"], dtype="string[pyarrow_numpy]"), + dtype=dtype, + index=Index(["row 1", "row 2"], dtype=dtype), + columns=Index(["col 1", "col 2"], dtype=dtype), ) tm.assert_frame_equal(result, expected) diff --git 
a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 800ece5a409e1..bc7b21baaeec5 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -547,8 +547,7 @@ def test_ea_int_avoid_overflow(all_parsers): def test_string_inference(all_parsers): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" + dtype = pd.StringDtype(na_value=np.nan) data = """a,b x,1 @@ -568,8 +567,6 @@ def test_string_inference(all_parsers): @pytest.mark.parametrize("dtype", ["O", object, "object", np.object_, str, np.str_]) def test_string_inference_object_dtype(all_parsers, dtype): # GH#56047 - pytest.importorskip("pyarrow") - data = """a,b x,a y,a @@ -583,7 +580,7 @@ def test_string_inference_object_dtype(all_parsers, dtype): "a": pd.Series(["x", "y", "z"], dtype=object), "b": pd.Series(["a", "a", "a"], dtype=object), }, - columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) @@ -593,9 +590,9 @@ def test_string_inference_object_dtype(all_parsers, dtype): expected = DataFrame( { "a": pd.Series(["x", "y", "z"], dtype=object), - "b": pd.Series(["a", "a", "a"], dtype="string[pyarrow_numpy]"), + "b": pd.Series(["a", "a", "a"], dtype=pd.StringDtype(na_value=np.nan)), }, - columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py index a04f02f0e052b..28cd8aea1defc 100644 --- a/pandas/tests/io/pytables/test_read.py +++ b/pandas/tests/io/pytables/test_read.py @@ -403,7 +403,6 @@ def test_read_py2_hdf_file_in_py3(datapath): def test_read_infer_string(tmp_path, setup_path): # GH#54431 - pytest.importorskip("pyarrow") df = 
DataFrame({"a": ["a", "b", None]}) path = tmp_path / setup_path df.to_hdf(path, key="data", format="table") @@ -411,7 +410,7 @@ def test_read_infer_string(tmp_path, setup_path): result = read_hdf(path, key="data", mode="r") expected = DataFrame( {"a": ["a", "b", None]}, - dtype="string[pyarrow_numpy]", - columns=Index(["a"], dtype="string[pyarrow_numpy]"), + dtype=pd.StringDtype(na_value=np.nan), + columns=Index(["a"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 57e12747a3746..24fc801de44a7 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -249,5 +249,7 @@ def test_string_inference(self, tmp_path): df.to_feather(path) with pd.option_context("future.infer_string", True): result = read_feather(path) - expected = pd.DataFrame(data={"a": ["x", "y"]}, dtype="string[pyarrow_numpy]") + expected = pd.DataFrame( + data={"a": ["x", "y"]}, dtype=pd.StringDtype(na_value=np.nan) + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index d2204a9134f90..4c4d7461e4ac5 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -438,7 +438,7 @@ def test_string_inference(tmp_path): result = read_orc(path) expected = pd.DataFrame( data={"a": ["x", "y"]}, - dtype="string[pyarrow_numpy]", - columns=pd.Index(["a"], dtype="string[pyarrow_numpy]"), + dtype=pd.StringDtype(na_value=np.nan), + columns=pd.Index(["a"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 578c0949a6c97..746ca3cf6534d 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1128,8 +1128,8 @@ def test_string_inference(self, tmp_path, pa): result = read_parquet(path, engine="pyarrow") expected = pd.DataFrame( data={"a": ["x", "y"]}, - 
dtype="string[pyarrow_numpy]", - index=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + dtype=pd.StringDtype(na_value=np.nan), + index=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) @@ -1159,8 +1159,8 @@ def test_infer_string_large_string_type(self, tmp_path, pa): result = read_parquet(path) expected = pd.DataFrame( data={"a": [None, "b", "c"]}, - dtype="string[pyarrow_numpy]", - columns=pd.Index(["a"], dtype="string[pyarrow_numpy]"), + dtype=pd.StringDtype(na_value=np.nan), + columns=pd.Index(["a"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index b1557d71f15e4..514eaceaccbe6 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -3840,7 +3840,6 @@ class Test(BaseModel): def test_read_sql_string_inference(sqlite_engine): conn = sqlite_engine # GH#54430 - pytest.importorskip("pyarrow") table = "test" df = DataFrame({"a": ["x", "y"]}) df.to_sql(table, con=conn, index=False, if_exists="replace") @@ -3848,7 +3847,7 @@ def test_read_sql_string_inference(sqlite_engine): with pd.option_context("future.infer_string", True): result = read_sql_table(table, conn) - dtype = "string[pyarrow_numpy]" + dtype = pd.StringDtype(na_value=np.nan) expected = DataFrame( {"a": ["x", "y"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) ) diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index 324d2a6cfd419..637bce59e9e2c 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -707,19 +707,17 @@ def test_get_dummies_ea_dtype_dataframe(self, any_numeric_ea_and_arrow_dtype): ) tm.assert_frame_equal(result, expected) - @td.skip_if_no("pyarrow") - def test_get_dummies_ea_dtype(self): + @pytest.mark.parametrize("dtype_type", ["string", "category"]) + def test_get_dummies_ea_dtype(self, dtype_type, 
string_dtype_no_object): # GH#56273 - for dtype, exp_dtype in [ - ("string[pyarrow]", "boolean"), - ("string[pyarrow_numpy]", "bool"), - (CategoricalDtype(Index(["a"], dtype="string[pyarrow]")), "boolean"), - (CategoricalDtype(Index(["a"], dtype="string[pyarrow_numpy]")), "bool"), - ]: - df = DataFrame({"name": Series(["a"], dtype=dtype), "x": 1}) - result = get_dummies(df) - expected = DataFrame({"x": 1, "name_a": Series([True], dtype=exp_dtype)}) - tm.assert_frame_equal(result, expected) + dtype = string_dtype_no_object + exp_dtype = "boolean" if dtype.na_value is pd.NA else "bool" + if dtype_type == "category": + dtype = CategoricalDtype(Index(["a"], dtype)) + df = DataFrame({"name": Series(["a"], dtype=dtype), "x": 1}) + result = get_dummies(df) + expected = DataFrame({"x": 1, "name_a": Series([True], dtype=exp_dtype)}) + tm.assert_frame_equal(result, expected) @td.skip_if_no("pyarrow") def test_get_dummies_arrow_dtype(self): diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 944e61896a182..e58187ba6bcbc 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -1225,9 +1225,9 @@ def test_missing_stubname(self, any_string_dtype): tm.assert_frame_equal(result, expected) -def test_wide_to_long_pyarrow_string_columns(): +def test_wide_to_long_string_columns(string_storage): # GH 57066 - pytest.importorskip("pyarrow") + string_dtype = pd.StringDtype(string_storage, na_value=np.nan) df = DataFrame( { "ID": {0: 1}, @@ -1237,7 +1237,7 @@ def test_wide_to_long_pyarrow_string_columns(): "D": {0: 1}, } ) - df.columns = df.columns.astype("string[pyarrow_numpy]") + df.columns = df.columns.astype(string_dtype) result = wide_to_long( df, stubnames="R", i="ID", j="UNPIVOTED", sep="_", suffix=".*" ) @@ -1247,7 +1247,7 @@ def test_wide_to_long_pyarrow_string_columns(): index=pd.MultiIndex.from_arrays( [ [1, 1, 1], - Index(["test1", "test2", "test3"], dtype="string[pyarrow_numpy]"), + Index(["test1", "test2", 
"test3"], dtype=string_dtype), ], names=["ID", "UNPIVOTED"], ), diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index b9ddfc189edce..26bdfcbc6ec56 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -11,6 +11,7 @@ DataFrame, Index, Series, + StringDtype, bdate_range, ) import pandas._testing as tm @@ -533,7 +534,7 @@ def test_pyarrow_numpy_string_invalid(self): # GH#56008 pa = pytest.importorskip("pyarrow") ser = Series([False, True]) - ser2 = Series(["a", "b"], dtype="string[pyarrow_numpy]") + ser2 = Series(["a", "b"], dtype=StringDtype(na_value=np.nan)) result = ser == ser2 expected_eq = Series(False, index=ser.index) tm.assert_series_equal(result, expected_eq) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 8c5a9b39157ea..f52872c3d2835 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -26,7 +26,7 @@ def using_pyarrow(dtype): - return dtype in ("string[pyarrow]", "string[pyarrow_numpy]") + return dtype == "string" and dtype.storage == "pyarrow" def test_contains(any_string_dtype): diff --git a/pandas/tests/util/test_shares_memory.py b/pandas/tests/util/test_shares_memory.py index 00a897d574a07..8f1ac93b40247 100644 --- a/pandas/tests/util/test_shares_memory.py +++ b/pandas/tests/util/test_shares_memory.py @@ -1,3 +1,5 @@ +import numpy as np + import pandas.util._test_decorators as td import pandas as pd @@ -20,10 +22,10 @@ def test_shares_memory_string(): # GH#55823 import pyarrow as pa - obj = pd.array(["a", "b"], dtype="string[pyarrow]") + obj = pd.array(["a", "b"], dtype=pd.StringDtype("pyarrow", na_value=pd.NA)) assert tm.shares_memory(obj, obj) - obj = pd.array(["a", "b"], dtype="string[pyarrow_numpy]") + obj = pd.array(["a", "b"], dtype=pd.StringDtype("pyarrow", na_value=np.nan)) assert tm.shares_memory(obj, obj) obj = pd.array(["a", "b"], 
dtype=pd.ArrowDtype(pa.string())) From 418f890883e5801dc8220fb1735d7b7196a42a31 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 10 Sep 2024 01:18:29 -0700 Subject: [PATCH 24/34] BUG (string): Series.str.slice with negative step (#59724) Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v2.3.0.rst | 3 ++- pandas/core/arrays/_arrow_string_mixins.py | 31 ++++++++++++++++------ pandas/core/arrays/arrow/array.py | 11 -------- pandas/core/arrays/string_arrow.py | 14 +--------- pandas/tests/extension/test_arrow.py | 1 + pandas/tests/strings/test_strings.py | 1 + 6 files changed, 28 insertions(+), 33 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 03355f655eb28..03b3a6b55dff6 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -103,8 +103,9 @@ Conversion Strings ^^^^^^^ - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`) +- Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`) - Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`) - +- Interval ^^^^^^^^ diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 4829b175783ed..042747ae7da1c 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -11,6 +11,7 @@ from pandas.compat import ( pa_version_under10p1, + pa_version_under11p0, pa_version_under13p0, pa_version_under17p0, ) @@ -22,16 +23,13 @@ import pyarrow.compute as pc if TYPE_CHECKING: - from collections.abc import ( - Callable, - Sized, - ) + from collections.abc import Callable from pandas._typing import Scalar class 
ArrowStringArrayMixin: - _pa_array: Sized + _pa_array: pa.ChunkedArray def __init__(self, *args, **kwargs) -> None: raise NotImplementedError @@ -93,12 +91,29 @@ def _str_get(self, i: int): selected = pc.utf8_slice_codeunits( self._pa_array, start=start, stop=stop, step=step ) - null_value = pa.scalar( - None, type=self._pa_array.type # type: ignore[attr-defined] - ) + null_value = pa.scalar(None, type=self._pa_array.type) result = pc.if_else(not_out_of_bounds, selected, null_value) return type(self)(result) + def _str_slice( + self, start: int | None = None, stop: int | None = None, step: int | None = None + ): + if pa_version_under11p0: + # GH#59724 + result = self._apply_elementwise(lambda val: val[start:stop:step]) + return type(self)(pa.chunked_array(result, type=self._pa_array.type)) + if start is None: + if step is not None and step < 0: + # GH#59710 + start = -1 + else: + start = 0 + if step is None: + step = 1 + return type(self)( + pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) + ) + def _str_slice_replace( self, start: int | None = None, stop: int | None = None, repl: str | None = None ): diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 861ec0c42c885..764213de87593 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2368,17 +2368,6 @@ def _str_rpartition(self, sep: str, expand: bool): result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_slice( - self, start: int | None = None, stop: int | None = None, step: int | None = None - ): - if start is None: - start = 0 - if step is None: - step = 1 - return type(self)( - pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) - ) - def _str_len(self): return type(self)(pc.utf8_length(self._pa_array)) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 5ed12e7352bd1..a7a661e8c0cb8 100644 --- 
a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -295,6 +295,7 @@ def _data(self): _str_startswith = ArrowStringArrayMixin._str_startswith _str_endswith = ArrowStringArrayMixin._str_endswith _str_pad = ArrowStringArrayMixin._str_pad + _str_slice = ArrowStringArrayMixin._str_slice def _str_contains( self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True @@ -351,19 +352,6 @@ def _str_fullmatch( pat = f"{pat}$" return self._str_match(pat, case, flags, na) - def _str_slice( - self, start: int | None = None, stop: int | None = None, step: int | None = None - ): - if stop is None: - return super()._str_slice(start, stop, step) - if start is None: - start = 0 - if step is None: - step = 1 - return type(self)( - pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) - ) - def _str_len(self): result = pc.utf8_length(self._pa_array) return self._convert_int_result(result) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 12f3eedb6b9f1..d0ec87905aa87 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2018,6 +2018,7 @@ def test_str_join_string_type(): [None, 2, None, ["ab", None]], [None, 2, 1, ["ab", None]], [1, 3, 1, ["bc", None]], + (None, None, -1, ["dcba", None]), ], ) def test_str_slice(start, stop, step, exp): diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 015df18221b40..40b6c69dc8025 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -393,6 +393,7 @@ def test_pipe_failures(any_string_dtype): (2, 5, None, ["foo", "bar", np.nan, "baz"]), (0, 3, -1, ["", "", np.nan, ""]), (None, None, -1, ["owtoofaa", "owtrabaa", np.nan, "xuqzabaa"]), + (None, 2, -1, ["owtoo", "owtra", np.nan, "xuqza"]), (3, 10, 2, ["oto", "ato", np.nan, "aqx"]), (3, 0, -1, ["ofa", "aba", np.nan, "aba"]), ], From 26a0d569a83a400d36d4c45231b421da9bfa8de3 
Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 10 Sep 2024 16:35:18 +0200 Subject: [PATCH 25/34] String dtype: remove fallback Perfomance warnings for string methods (#59760) --- pandas/core/arrays/arrow/_arrow_utils.py | 16 ----- pandas/core/arrays/string_arrow.py | 4 -- pandas/tests/extension/test_string.py | 1 - pandas/tests/indexes/test_setops.py | 12 ---- pandas/tests/strings/test_find_replace.py | 72 ++++++++--------------- pandas/tests/strings/test_string_array.py | 1 - 6 files changed, 23 insertions(+), 83 deletions(-) diff --git a/pandas/core/arrays/arrow/_arrow_utils.py b/pandas/core/arrays/arrow/_arrow_utils.py index 2a053fac2985c..285c3fd465ffc 100644 --- a/pandas/core/arrays/arrow/_arrow_utils.py +++ b/pandas/core/arrays/arrow/_arrow_utils.py @@ -1,24 +1,8 @@ from __future__ import annotations -import warnings - import numpy as np import pyarrow -from pandas.errors import PerformanceWarning -from pandas.util._exceptions import find_stack_level - - -def fallback_performancewarning(version: str | None = None) -> None: - """ - Raise a PerformanceWarning for falling back to ExtensionArray's - non-pyarrow method - """ - msg = "Falling back on a non-pyarrow code path which may decrease performance." - if version is not None: - msg += f" Upgrade to pyarrow >={version} to possibly suppress this warning." 
- warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level()) - def pyarrow_array_to_numpy_and_mask( arr, dtype: np.dtype diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index a7a661e8c0cb8..1591253b01345 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -42,8 +42,6 @@ import pyarrow as pa import pyarrow.compute as pc - from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning - if TYPE_CHECKING: from collections.abc import Sequence @@ -301,7 +299,6 @@ def _str_contains( self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True ): if flags: - fallback_performancewarning() return super()._str_contains(pat, case, flags, na, regex) if not isna(na): @@ -327,7 +324,6 @@ def _str_replace( regex: bool = True, ): if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: - fallback_performancewarning() return super()._str_replace(pat, repl, n, case, flags, regex) return ArrowExtensionArray._str_replace(self, pat, repl, n, case, flags, regex) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 7f04858318013..354b4d5333c7d 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -212,7 +212,6 @@ def test_compare_scalar(self, data, comparison_op): ser = pd.Series(data) self._compare_other(ser, data, comparison_op, "abc") - @pytest.mark.filterwarnings("ignore:Falling back:pandas.errors.PerformanceWarning") def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 4a6982cf98670..72c3396f124b8 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -240,9 +240,6 @@ def test_intersection_base(self, index): with pytest.raises(TypeError, 
match=msg): first.intersection([1, 2, 3]) - @pytest.mark.filterwarnings( - "ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning" - ) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_union_base(self, index): index = index.unique() @@ -270,9 +267,6 @@ def test_union_base(self, index): first.union([1, 2, 3]) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") - @pytest.mark.filterwarnings( - "ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning" - ) def test_difference_base(self, sort, index): first = index[2:] second = index[:4] @@ -299,9 +293,6 @@ def test_difference_base(self, sort, index): first.difference([1, 2, 3], sort) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") - @pytest.mark.filterwarnings( - "ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning" - ) def test_symmetric_difference(self, index): if isinstance(index, CategoricalIndex): pytest.skip(f"Not relevant for {type(index).__name__}") @@ -523,9 +514,6 @@ def test_intersection_difference_match_empty(self, index, sort): @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") -@pytest.mark.filterwarnings( - "ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning" -) @pytest.mark.parametrize( "method", ["intersection", "union", "difference", "symmetric_difference"] ) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index f52872c3d2835..2742c5b67e57e 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -4,10 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW -from pandas.errors import PerformanceWarning import pandas.util._test_decorators as td import pandas as pd @@ -25,10 +21,6 @@ # 
-------------------------------------------------------------------------------------- -def using_pyarrow(dtype): - return dtype == "string" and dtype.storage == "pyarrow" - - def test_contains(any_string_dtype): values = np.array( ["foo", np.nan, "fooommm__foo", "mmm_", "foommm[_]+bar"], dtype=np.object_ @@ -281,10 +273,13 @@ def test_contains_nan(any_string_dtype): # -------------------------------------------------------------------------------------- -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False -) -def test_startswith_endswith_validate_na(any_string_dtype): +def test_startswith_endswith_validate_na(request, any_string_dtype): + if ( + any_string_dtype == "string" + and any_string_dtype.na_value is np.nan + and any_string_dtype.storage == "python" + ): + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) # GH#59615 ser = Series( ["om", np.nan, "foo_nom", "nom", "bar_foo", np.nan, "foo"], @@ -462,8 +457,7 @@ def test_replace_mixed_object(): def test_replace_unicode(any_string_dtype): ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True) + result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True) tm.assert_series_equal(result, expected) @@ -483,8 +477,7 @@ def test_replace_callable(any_string_dtype): # test with callable repl = lambda m: m.group(0).swapcase() - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True) + result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -502,10 +495,7 
@@ def test_replace_callable_raises(any_string_dtype, repl): r"(?(3)required )positional arguments?" ) with pytest.raises(TypeError, match=msg): - with tm.maybe_produces_warning( - PerformanceWarning, using_pyarrow(any_string_dtype) - ): - values.str.replace("a", repl, regex=True) + values.str.replace("a", repl, regex=True) def test_replace_callable_named_groups(any_string_dtype): @@ -513,8 +503,7 @@ def test_replace_callable_named_groups(any_string_dtype): ser = Series(["Foo Bar Baz", np.nan], dtype=any_string_dtype) pat = r"(?P\w+) (?P\w+) (?P\w+)" repl = lambda m: m.group("middle").swapcase() - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace(pat, repl, regex=True) + result = ser.str.replace(pat, repl, regex=True) expected = Series(["bAR", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -525,13 +514,11 @@ def test_replace_compiled_regex(any_string_dtype): # test with compiled regex pat = re.compile(r"BAD_*") - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace(pat, "", regex=True) + result = ser.str.replace(pat, "", regex=True) expected = Series(["foobar", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace(pat, "", n=1, regex=True) + result = ser.str.replace(pat, "", n=1, regex=True) expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -552,8 +539,7 @@ def test_replace_compiled_regex_unicode(any_string_dtype): ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE) - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = 
ser.str.replace(pat, ", ", regex=True) + result = ser.str.replace(pat, ", ", regex=True) tm.assert_series_equal(result, expected) @@ -580,8 +566,7 @@ def test_replace_compiled_regex_callable(any_string_dtype): ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) repl = lambda m: m.group(0).swapcase() pat = re.compile("[a-z][A-Z]{2}") - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace(pat, repl, n=2, regex=True) + result = ser.str.replace(pat, repl, n=2, regex=True) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -629,8 +614,7 @@ def test_replace_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace("A", "YYY", case=False) + result = ser.str.replace("A", "YYY", case=False) expected = Series( [ "YYY", @@ -648,8 +632,7 @@ def test_replace_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True) + result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True) expected = Series( [ "A", @@ -672,13 +655,11 @@ def test_replace_not_case_sensitive_not_regex(any_string_dtype): # https://github.com/pandas-dev/pandas/issues/41602 ser = Series(["A.", "a.", "Ab", "ab", np.nan], dtype=any_string_dtype) - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace("a", "c", case=False, regex=False) + result = ser.str.replace("a", "c", case=False, regex=False) expected = Series(["c.", "c.", "cb", "cb", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace("a.", 
"c.", case=False, regex=False) + result = ser.str.replace("a.", "c.", case=False, regex=False) expected = Series(["c.", "c.", "Ab", "ab", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -850,8 +831,7 @@ def test_fullmatch_case_kwarg(any_string_dtype): result = ser.str.fullmatch("ab", case=False) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.fullmatch("ab", flags=re.IGNORECASE) + result = ser.str.fullmatch("ab", flags=re.IGNORECASE) tm.assert_series_equal(result, expected) @@ -1036,17 +1016,13 @@ def test_flags_kwarg(any_string_dtype): pat = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})" - use_pyarrow = using_pyarrow(any_string_dtype) - result = data.str.extract(pat, flags=re.IGNORECASE, expand=True) assert result.iloc[0].tolist() == ["dave", "google", "com"] - with tm.maybe_produces_warning(PerformanceWarning, use_pyarrow): - result = data.str.match(pat, flags=re.IGNORECASE) + result = data.str.match(pat, flags=re.IGNORECASE) assert result.iloc[0] - with tm.maybe_produces_warning(PerformanceWarning, use_pyarrow): - result = data.str.fullmatch(pat, flags=re.IGNORECASE) + result = data.str.fullmatch(pat, flags=re.IGNORECASE) assert result.iloc[0] result = data.str.findall(pat, flags=re.IGNORECASE) @@ -1056,8 +1032,6 @@ def test_flags_kwarg(any_string_dtype): assert result.iloc[0] == 1 msg = "has match groups" - with tm.assert_produces_warning( - UserWarning, match=msg, raise_on_extra_warnings=not use_pyarrow - ): + with tm.assert_produces_warning(UserWarning, match=msg): result = data.str.contains(pat, flags=re.IGNORECASE) assert result.iloc[0] diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py index 0b3f368afea5e..517ddb164985c 100644 --- a/pandas/tests/strings/test_string_array.py +++ b/pandas/tests/strings/test_string_array.py @@ -12,7 +12,6 @@ ) 
-@pytest.mark.filterwarnings("ignore:Falling back") def test_string_array(nullable_string_dtype, any_string_method): method_name, args, kwargs = any_string_method From c8eadfd542ac1791169ead98f351f850e7451369 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 11 Sep 2024 12:40:01 -0700 Subject: [PATCH 26/34] REF (string): de-duplicate ArrowStringArray methods (#59555) --- pandas/core/arrays/_arrow_string_mixins.py | 90 ++++++++++++++++- pandas/core/arrays/arrow/array.py | 86 +---------------- pandas/core/arrays/string_arrow.py | 106 ++++----------------- 3 files changed, 108 insertions(+), 174 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 042747ae7da1c..a39668faf779e 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -1,6 +1,7 @@ from __future__ import annotations from functools import partial +import re from typing import ( TYPE_CHECKING, Any, @@ -25,7 +26,10 @@ if TYPE_CHECKING: from collections.abc import Callable - from pandas._typing import Scalar + from pandas._typing import ( + Scalar, + Self, + ) class ArrowStringArrayMixin: @@ -45,6 +49,37 @@ def _convert_int_result(self, result): def _apply_elementwise(self, func: Callable) -> list[list[Any]]: raise NotImplementedError + def _str_len(self): + result = pc.utf8_length(self._pa_array) + return self._convert_int_result(result) + + def _str_lower(self) -> Self: + return type(self)(pc.utf8_lower(self._pa_array)) + + def _str_upper(self) -> Self: + return type(self)(pc.utf8_upper(self._pa_array)) + + def _str_strip(self, to_strip=None) -> Self: + if to_strip is None: + result = pc.utf8_trim_whitespace(self._pa_array) + else: + result = pc.utf8_trim(self._pa_array, characters=to_strip) + return type(self)(result) + + def _str_lstrip(self, to_strip=None) -> Self: + if to_strip is None: + result = pc.utf8_ltrim_whitespace(self._pa_array) + else: + result = 
pc.utf8_ltrim(self._pa_array, characters=to_strip) + return type(self)(result) + + def _str_rstrip(self, to_strip=None) -> Self: + if to_strip is None: + result = pc.utf8_rtrim_whitespace(self._pa_array) + else: + result = pc.utf8_rtrim(self._pa_array, characters=to_strip) + return type(self)(result) + def _str_pad( self, width: int, @@ -125,7 +160,34 @@ def _str_slice_replace( stop = np.iinfo(np.int64).max return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl)) - def _str_capitalize(self): + def _str_replace( + self, + pat: str | re.Pattern, + repl: str | Callable, + n: int = -1, + case: bool = True, + flags: int = 0, + regex: bool = True, + ) -> Self: + if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: + raise NotImplementedError( + "replace is not supported with a re.Pattern, callable repl, " + "case=False, or flags!=0" + ) + + func = pc.replace_substring_regex if regex else pc.replace_substring + # https://github.com/apache/arrow/issues/39149 + # GH 56404, unexpected behavior with negative max_replacements with pyarrow. 
+ pa_max_replacements = None if n < 0 else n + result = func( + self._pa_array, + pattern=pat, + replacement=repl, + max_replacements=pa_max_replacements, + ) + return type(self)(result) + + def _str_capitalize(self) -> Self: return type(self)(pc.utf8_capitalize(self._pa_array)) def _str_title(self): @@ -134,6 +196,16 @@ def _str_title(self): def _str_swapcase(self): return type(self)(pc.utf8_swapcase(self._pa_array)) + def _str_removeprefix(self, prefix: str): + if not pa_version_under13p0: + starts_with = pc.starts_with(self._pa_array, pattern=prefix) + removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) + result = pc.if_else(starts_with, removed, self._pa_array) + return type(self)(result) + predicate = lambda val: val.removeprefix(prefix) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + def _str_removesuffix(self, suffix: str): ends_with = pc.ends_with(self._pa_array, pattern=suffix) removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) @@ -225,6 +297,20 @@ def _str_contains( result = result.fill_null(na) return self._convert_bool_result(result) + def _str_match( + self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None + ): + if not pat.startswith("^"): + pat = f"^{pat}" + return self._str_contains(pat, case, flags, na, regex=True) + + def _str_fullmatch( + self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None + ): + if not pat.endswith("$") or pat.endswith("\\$"): + pat = f"{pat}$" + return self._str_match(pat, case, flags, na) + def _str_find(self, sub: str, start: int = 0, end: int | None = None): if ( pa_version_under13p0 diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 764213de87593..56f38cc4f5361 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1989,7 +1989,7 @@ def _rank( """ See Series.rank.__doc__. 
""" - return type(self)( + return self._convert_int_result( self._rank_calc( axis=axis, method=method, @@ -2296,36 +2296,6 @@ def _str_count(self, pat: str, flags: int = 0): raise NotImplementedError(f"count not implemented with {flags=}") return type(self)(pc.count_substring_regex(self._pa_array, pat)) - def _result_converter(self, result): - return type(self)(result) - - def _str_replace( - self, - pat: str | re.Pattern, - repl: str | Callable, - n: int = -1, - case: bool = True, - flags: int = 0, - regex: bool = True, - ): - if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: - raise NotImplementedError( - "replace is not supported with a re.Pattern, callable repl, " - "case=False, or flags!=0" - ) - - func = pc.replace_substring_regex if regex else pc.replace_substring - # https://github.com/apache/arrow/issues/39149 - # GH 56404, unexpected behavior with negative max_replacements with pyarrow. - pa_max_replacements = None if n < 0 else n - result = func( - self._pa_array, - pattern=pat, - replacement=repl, - max_replacements=pa_max_replacements, - ) - return type(self)(result) - def _str_repeat(self, repeats: int | Sequence[int]): if not isinstance(repeats, int): raise NotImplementedError( @@ -2334,20 +2304,6 @@ def _str_repeat(self, repeats: int | Sequence[int]): else: return type(self)(pc.binary_repeat(self._pa_array, repeats)) - def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None - ): - if not pat.startswith("^"): - pat = f"^{pat}" - return self._str_contains(pat, case, flags, na, regex=True) - - def _str_fullmatch( - self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None - ): - if not pat.endswith("$") or pat.endswith("\\$"): - pat = f"{pat}$" - return self._str_match(pat, case, flags, na) - def _str_join(self, sep: str): if pa.types.is_string(self._pa_array.type) or pa.types.is_large_string( self._pa_array.type @@ -2368,46 +2324,6 @@ def _str_rpartition(self, sep: str, expand: 
bool): result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_len(self): - return type(self)(pc.utf8_length(self._pa_array)) - - def _str_lower(self): - return type(self)(pc.utf8_lower(self._pa_array)) - - def _str_upper(self): - return type(self)(pc.utf8_upper(self._pa_array)) - - def _str_strip(self, to_strip=None): - if to_strip is None: - result = pc.utf8_trim_whitespace(self._pa_array) - else: - result = pc.utf8_trim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_lstrip(self, to_strip=None): - if to_strip is None: - result = pc.utf8_ltrim_whitespace(self._pa_array) - else: - result = pc.utf8_ltrim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_rstrip(self, to_strip=None): - if to_strip is None: - result = pc.utf8_rtrim_whitespace(self._pa_array) - else: - result = pc.utf8_rtrim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_removeprefix(self, prefix: str): - if not pa_version_under13p0: - starts_with = pc.starts_with(self._pa_array, pattern=prefix) - removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) - result = pc.if_else(starts_with, removed, self._pa_array) - return type(self)(result) - predicate = lambda val: val.removeprefix(prefix) - result = self._apply_elementwise(predicate) - return type(self)(pa.chunked_array(result)) - def _str_casefold(self): predicate = lambda val: val.casefold() result = self._apply_elementwise(predicate) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 1591253b01345..80651dcdaebe1 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -48,9 +48,7 @@ from pandas._typing import ( ArrayLike, - AxisInt, Dtype, - Scalar, npt, ) @@ -293,6 +291,20 @@ def _data(self): _str_startswith = ArrowStringArrayMixin._str_startswith _str_endswith = ArrowStringArrayMixin._str_endswith _str_pad = 
ArrowStringArrayMixin._str_pad + _str_match = ArrowStringArrayMixin._str_match + _str_fullmatch = ArrowStringArrayMixin._str_fullmatch + _str_lower = ArrowStringArrayMixin._str_lower + _str_upper = ArrowStringArrayMixin._str_upper + _str_strip = ArrowStringArrayMixin._str_strip + _str_lstrip = ArrowStringArrayMixin._str_lstrip + _str_rstrip = ArrowStringArrayMixin._str_rstrip + _str_removesuffix = ArrowStringArrayMixin._str_removesuffix + _str_get = ArrowStringArrayMixin._str_get + _str_capitalize = ArrowStringArrayMixin._str_capitalize + _str_title = ArrowStringArrayMixin._str_title + _str_swapcase = ArrowStringArrayMixin._str_swapcase + _str_slice_replace = ArrowStringArrayMixin._str_slice_replace + _str_len = ArrowStringArrayMixin._str_len _str_slice = ArrowStringArrayMixin._str_slice def _str_contains( @@ -326,73 +338,21 @@ def _str_replace( if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: return super()._str_replace(pat, repl, n, case, flags, regex) - return ArrowExtensionArray._str_replace(self, pat, repl, n, case, flags, regex) + return ArrowStringArrayMixin._str_replace( + self, pat, repl, n, case, flags, regex + ) def _str_repeat(self, repeats: int | Sequence[int]): if not isinstance(repeats, int): return super()._str_repeat(repeats) else: - return type(self)(pc.binary_repeat(self._pa_array, repeats)) - - def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None - ): - if not pat.startswith("^"): - pat = f"^{pat}" - return self._str_contains(pat, case, flags, na, regex=True) - - def _str_fullmatch( - self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None - ): - if not pat.endswith("$") or pat.endswith("\\$"): - pat = f"{pat}$" - return self._str_match(pat, case, flags, na) - - def _str_len(self): - result = pc.utf8_length(self._pa_array) - return self._convert_int_result(result) - - def _str_lower(self): - return type(self)(pc.utf8_lower(self._pa_array)) - - def _str_upper(self): - 
return type(self)(pc.utf8_upper(self._pa_array)) - - def _str_strip(self, to_strip=None): - if to_strip is None: - result = pc.utf8_trim_whitespace(self._pa_array) - else: - result = pc.utf8_trim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_lstrip(self, to_strip=None): - if to_strip is None: - result = pc.utf8_ltrim_whitespace(self._pa_array) - else: - result = pc.utf8_ltrim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_rstrip(self, to_strip=None): - if to_strip is None: - result = pc.utf8_rtrim_whitespace(self._pa_array) - else: - result = pc.utf8_rtrim(self._pa_array, characters=to_strip) - return type(self)(result) + return ArrowExtensionArray._str_repeat(self, repeats=repeats) def _str_removeprefix(self, prefix: str): if not pa_version_under13p0: - starts_with = pc.starts_with(self._pa_array, pattern=prefix) - removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) - result = pc.if_else(starts_with, removed, self._pa_array) - return type(self)(result) + return ArrowStringArrayMixin._str_removeprefix(self, prefix) return super()._str_removeprefix(prefix) - def _str_removesuffix(self, suffix: str): - ends_with = pc.ends_with(self._pa_array, pattern=suffix) - removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) - result = pc.if_else(ends_with, removed, self._pa_array) - return type(self)(result) - def _str_count(self, pat: str, flags: int = 0): if flags: return super()._str_count(pat, flags) @@ -449,28 +409,6 @@ def _reduce( else: return result - def _rank( - self, - *, - axis: AxisInt = 0, - method: str = "average", - na_option: str = "keep", - ascending: bool = True, - pct: bool = False, - ): - """ - See Series.rank.__doc__. 
- """ - return self._convert_int_result( - self._rank_calc( - axis=axis, - method=method, - na_option=na_option, - ascending=ascending, - pct=pct, - ) - ) - def value_counts(self, dropna: bool = True) -> Series: result = super().value_counts(dropna=dropna) if self.dtype.na_value is np.nan: @@ -492,9 +430,3 @@ def _cmp_method(self, other, op): class ArrowStringArrayNumpySemantics(ArrowStringArray): _na_value = np.nan - _str_get = ArrowStringArrayMixin._str_get - _str_removesuffix = ArrowStringArrayMixin._str_removesuffix - _str_capitalize = ArrowStringArrayMixin._str_capitalize - _str_title = ArrowStringArrayMixin._str_title - _str_swapcase = ArrowStringArrayMixin._str_swapcase - _str_slice_replace = ArrowStringArrayMixin._str_slice_replace From 37886a6abb42e2f75518c8b41907211cf1fa8a97 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 12 Sep 2024 23:08:34 +0200 Subject: [PATCH 27/34] BUG/API (string dtype): return float dtype for series[str].rank() (#59768) * BUG/API (string dtype): return float dtype for series[str].rank() * update frame tests * add whatsnew * correct whatsnew note --- doc/source/whatsnew/v2.3.0.rst | 1 + pandas/core/arrays/arrow/array.py | 5 +- pandas/core/arrays/string_arrow.py | 11 ++++ pandas/tests/frame/methods/test_rank.py | 23 ++------ pandas/tests/series/methods/test_rank.py | 72 ++++++++++++++++++------ 5 files changed, 76 insertions(+), 36 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 03b3a6b55dff6..01c2ed3821d7a 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -102,6 +102,7 @@ Conversion Strings ^^^^^^^ +- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`) - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`) - 
Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`) - Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 56f38cc4f5361..e0ccbd6fdc5fd 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1989,7 +1989,7 @@ def _rank( """ See Series.rank.__doc__. """ - return self._convert_int_result( + return self._convert_rank_result( self._rank_calc( axis=axis, method=method, @@ -2291,6 +2291,9 @@ def _convert_bool_result(self, result): def _convert_int_result(self, result): return type(self)(result) + def _convert_rank_result(self, result): + return type(self)(result) + def _str_count(self, pat: str, flags: int = 0): if flags: raise NotImplementedError(f"count not implemented with {flags=}") diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 80651dcdaebe1..56f7d3aecce20 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -30,6 +30,7 @@ from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin from pandas.core.arrays.arrow import ArrowExtensionArray from pandas.core.arrays.boolean import BooleanDtype +from pandas.core.arrays.floating import Float64Dtype from pandas.core.arrays.integer import Int64Dtype from pandas.core.arrays.numeric import NumericDtype from pandas.core.arrays.string_ import ( @@ -388,6 +389,16 @@ def _convert_int_result(self, result): return Int64Dtype().__from_arrow__(result) + def _convert_rank_result(self, result): + if self.dtype.na_value is np.nan: + if isinstance(result, pa.Array): + result = result.to_numpy(zero_copy_only=False) + else: + result = 
result.to_numpy() + return result.astype("float64", copy=False) + + return Float64Dtype().__from_arrow__(result) + def _reduce( self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs ): diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 82722eeb1af72..37bed2da05743 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -6,15 +6,11 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.algos import ( Infinity, NegInfinity, ) -from pandas.compat import HAS_PYARROW -import pandas as pd from pandas import ( DataFrame, Index, @@ -474,23 +470,10 @@ def test_rank_inf_nans_na_option( ("top", False, [2.0, 3.0, 1.0, 4.0]), ], ) - def test_rank_object_first( - self, - request, - frame_or_series, - na_option, - ascending, - expected, - using_infer_string, - ): + def test_rank_object_first(self, frame_or_series, na_option, ascending, expected): obj = frame_or_series(["foo", "foo", None, "foo"]) - if using_string_dtype() and not HAS_PYARROW and isinstance(obj, Series): - request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) - result = obj.rank(method="first", na_option=na_option, ascending=ascending) expected = frame_or_series(expected) - if using_infer_string and isinstance(obj, Series): - expected = expected.astype("uint64") tm.assert_equal(result, expected) @pytest.mark.parametrize( @@ -514,7 +497,9 @@ def test_rank_string_dtype(self, string_dtype_no_object): # GH#55362 obj = Series(["foo", "foo", None, "foo"], dtype=string_dtype_no_object) result = obj.rank(method="first") - exp_dtype = "Int64" if string_dtype_no_object.na_value is pd.NA else "float64" + exp_dtype = ( + "Float64" if string_dtype_no_object == "string[pyarrow]" else "float64" + ) if string_dtype_no_object.storage == "python": # TODO nullable string[python] should also return nullable Int64 exp_dtype = "float64" diff --git 
a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index 24cf97c05c0a8..f0fe1d989941e 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -33,7 +33,8 @@ def ser(): ["max", np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6])], ["first", np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6])], ["dense", np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3])], - ] + ], + ids=lambda x: x[0], ) def results(request): return request.param @@ -48,12 +49,29 @@ def results(request): "Int64", pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")), pytest.param("int64[pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), + "string[python]", + "str", ] ) def dtype(request): return request.param +def expected_dtype(dtype, method, pct=False): + exp_dtype = "float64" + # elif dtype in ["Int64", "Float64", "string[pyarrow]", "string[python]"]: + if dtype in ["string[pyarrow]"]: + exp_dtype = "Float64" + elif dtype in ["float64[pyarrow]", "int64[pyarrow]"]: + if method == "average" or pct: + exp_dtype = "double[pyarrow]" + else: + exp_dtype = "uint64[pyarrow]" + + return exp_dtype + + class TestSeriesRank: def test_rank(self, datetime_series): sp_stats = pytest.importorskip("scipy.stats") @@ -241,12 +259,14 @@ def test_rank_signature(self): with pytest.raises(ValueError, match=msg): s.rank("average") - @pytest.mark.parametrize("dtype", [None, object]) - def test_rank_tie_methods(self, ser, results, dtype): + def test_rank_tie_methods(self, ser, results, dtype, using_infer_string): method, exp = results + if dtype == "int64" or (not using_infer_string and dtype == "str"): + pytest.skip("int64/str does not support NaN") + ser = ser if dtype is None else ser.astype(dtype) result = ser.rank(method=method) - tm.assert_series_equal(result, Series(exp)) + tm.assert_series_equal(result, Series(exp, dtype=expected_dtype(dtype, method))) 
@pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"]) @@ -346,25 +366,35 @@ def test_rank_methods_series(self, method, op, value): ], ) def test_rank_dense_method(self, dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="dense") - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "dense")) tm.assert_series_equal(result, expected) - def test_rank_descending(self, ser, results, dtype): + def test_rank_descending(self, ser, results, dtype, using_infer_string): method, _ = results - if "i" in dtype: + if dtype == "int64" or (not using_infer_string and dtype == "str"): s = ser.dropna() else: s = ser.astype(dtype) res = s.rank(ascending=False) - expected = (s.max() - s).rank() - tm.assert_series_equal(res, expected) + if dtype.startswith("str"): + expected = (s.astype("float64").max() - s.astype("float64")).rank() + else: + expected = (s.max() - s).rank() + tm.assert_series_equal(res, expected.astype(expected_dtype(dtype, "average"))) - expected = (s.max() - s).rank(method=method) + if dtype.startswith("str"): + expected = (s.astype("float64").max() - s.astype("float64")).rank( + method=method + ) + else: + expected = (s.max() - s).rank(method=method) res2 = s.rank(method=method, ascending=False) - tm.assert_series_equal(res2, expected) + tm.assert_series_equal(res2, expected.astype(expected_dtype(dtype, method))) def test_rank_int(self, ser, results): method, exp = results @@ -421,9 +451,11 @@ def test_rank_ea_small_values(self): ], ) def test_rank_dense_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="dense", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "dense", pct=True)) tm.assert_series_equal(result, 
expected) @@ -442,9 +474,11 @@ def test_rank_dense_pct(dtype, ser, exp): ], ) def test_rank_min_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="min", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "min", pct=True)) tm.assert_series_equal(result, expected) @@ -463,9 +497,11 @@ def test_rank_min_pct(dtype, ser, exp): ], ) def test_rank_max_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="max", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "max", pct=True)) tm.assert_series_equal(result, expected) @@ -484,9 +520,11 @@ def test_rank_max_pct(dtype, ser, exp): ], ) def test_rank_average_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="average", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "average", pct=True)) tm.assert_series_equal(result, expected) @@ -505,9 +543,11 @@ def test_rank_average_pct(dtype, ser, exp): ], ) def test_rank_first_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="first", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "first", pct=True)) tm.assert_series_equal(result, expected) From 532b9a1b0646a6746f494c147e698b061705832b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 12 Sep 2024 23:11:52 +0200 Subject: [PATCH 28/34] String dtype: fix isin() values handling for python storage (#59759) * String dtype: fix isin() values handling for python storage * address feedback --- pandas/conftest.py | 9 ++++- pandas/core/arrays/string_.py 
| 20 +++++++++++ pandas/tests/arrays/string_/test_string.py | 41 +++++++++++++++++++--- 3 files changed, 64 insertions(+), 6 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index f957289ea52e8..c6237d0309630 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1294,7 +1294,13 @@ def string_storage(request): pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), ("python", np.nan), - ] + ], + ids=[ + "string=string[python]", + "string=string[pyarrow]", + "string=str[pyarrow]", + "string=str[python]", + ], ) def string_dtype_arguments(request): """ @@ -1325,6 +1331,7 @@ def dtype_backend(request): # Alias so we can test with cartesian product of string_storage string_storage2 = string_storage +string_dtype_arguments2 = string_dtype_arguments @pytest.fixture(params=tm.BYTES_DTYPES) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 43c46a4308f9e..0b0fffcb928a3 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -46,6 +46,7 @@ nanops, ops, ) +from pandas.core.algorithms import isin from pandas.core.array_algos import masked_reductions from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.floating import ( @@ -65,6 +66,7 @@ import pyarrow from pandas._typing import ( + ArrayLike, AxisInt, Dtype, DtypeObj, @@ -733,6 +735,24 @@ def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None: # base class implementation that uses __setitem__ ExtensionArray._putmask(self, mask, value) + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: + if isinstance(values, BaseStringArray) or ( + isinstance(values, ExtensionArray) and is_string_dtype(values.dtype) + ): + values = values.astype(self.dtype, copy=False) + else: + if not lib.is_string_array(np.asarray(values), skipna=True): + values = np.array( + [val for val in values if isinstance(val, str) or isna(val)], + dtype=object, + ) + if 
not len(values): + return np.zeros(self.shape, dtype=bool) + + values = self._from_sequence(values, dtype=self.dtype) + + return isin(np.asarray(self), np.asarray(values)) + def astype(self, dtype, copy: bool = True): dtype = pandas_dtype(dtype) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index d3a0897f88f61..265b9fc40629b 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -29,6 +29,12 @@ def dtype(string_dtype_arguments): return pd.StringDtype(storage=storage, na_value=na_value) +@pytest.fixture +def dtype2(string_dtype_arguments2): + storage, na_value = string_dtype_arguments2 + return pd.StringDtype(storage=storage, na_value=na_value) + + @pytest.fixture def cls(dtype): """Fixture giving array type from parametrized 'dtype'""" @@ -689,11 +695,7 @@ def test_isin(dtype, fixed_now_ts): tm.assert_series_equal(result, expected) result = s.isin(["a", pd.NA]) - if dtype.storage == "python" and dtype.na_value is np.nan: - # TODO(infer_string) we should make this consistent - expected = pd.Series([True, False, False]) - else: - expected = pd.Series([True, False, True]) + expected = pd.Series([True, False, True]) tm.assert_series_equal(result, expected) result = s.isin([]) @@ -704,6 +706,35 @@ def test_isin(dtype, fixed_now_ts): expected = pd.Series([True, False, False]) tm.assert_series_equal(result, expected) + result = s.isin([fixed_now_ts]) + expected = pd.Series([False, False, False]) + tm.assert_series_equal(result, expected) + + +def test_isin_string_array(dtype, dtype2): + s = pd.Series(["a", "b", None], dtype=dtype) + + result = s.isin(pd.array(["a", "c"], dtype=dtype2)) + expected = pd.Series([True, False, False]) + tm.assert_series_equal(result, expected) + + result = s.isin(pd.array(["a", None], dtype=dtype2)) + expected = pd.Series([True, False, True]) + tm.assert_series_equal(result, expected) + + +def test_isin_arrow_string_array(dtype): + pa = 
pytest.importorskip("pyarrow") + s = pd.Series(["a", "b", None], dtype=dtype) + + result = s.isin(pd.array(["a", "c"], dtype=pd.ArrowDtype(pa.string()))) + expected = pd.Series([True, False, False]) + tm.assert_series_equal(result, expected) + + result = s.isin(pd.array(["a", None], dtype=pd.ArrowDtype(pa.string()))) + expected = pd.Series([True, False, True]) + tm.assert_series_equal(result, expected) + def test_setitem_scalar_with_mask_validation(dtype): # https://github.com/pandas-dev/pandas/issues/47628 From 4ff2c686df3eda2e418d289fc776048b43b3de18 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 16 Sep 2024 19:25:59 +0200 Subject: [PATCH 29/34] String dtype: allow string dtype in query/eval with default numexpr engine (#59810) String dtype: allow string dtype in query/eval with default numexpr engine --- pandas/core/computation/eval.py | 12 +++++++++--- pandas/core/computation/expr.py | 6 +++++- pandas/tests/frame/test_query_eval.py | 21 ++++++--------------- 3 files changed, 20 insertions(+), 19 deletions(-) diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index f1fe528de06f8..7bb623cba3755 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -10,7 +10,10 @@ from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_bool_kwarg -from pandas.core.dtypes.common import is_extension_array_dtype +from pandas.core.dtypes.common import ( + is_extension_array_dtype, + is_string_dtype, +) from pandas.core.computation.engines import ENGINES from pandas.core.computation.expr import ( @@ -336,10 +339,13 @@ def eval( parsed_expr = Expr(expr, engine=engine, parser=parser, env=env) if engine == "numexpr" and ( - is_extension_array_dtype(parsed_expr.terms.return_type) + ( + is_extension_array_dtype(parsed_expr.terms.return_type) + and not is_string_dtype(parsed_expr.terms.return_type) + ) or getattr(parsed_expr.terms, "operand_types", None) is not None and any(
- is_extension_array_dtype(elem) + (is_extension_array_dtype(elem) and not is_string_dtype(elem)) for elem in parsed_expr.terms.operand_types ) ): diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index d642c37cea129..34055d2177626 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -20,6 +20,8 @@ from pandas.errors import UndefinedVariableError +from pandas.core.dtypes.common import is_string_dtype + import pandas.core.common as com from pandas.core.computation.ops import ( ARITH_OPS_SYMS, @@ -520,10 +522,12 @@ def _maybe_evaluate_binop( elif self.engine != "pytables": if ( getattr(lhs, "return_type", None) == object + or is_string_dtype(getattr(lhs, "return_type", None)) or getattr(rhs, "return_type", None) == object + or is_string_dtype(getattr(rhs, "return_type", None)) ): # evaluate "==" and "!=" in python if either of our operands - # has an object return type + # has an object or string return type return self._maybe_eval(res, eval_in_python + maybe_eval_in_python) return res diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 7dde0683aa960..27848e4d18596 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ( NumExprClobberingError, UndefinedVariableError, @@ -747,7 +745,6 @@ def test_inf(self, op, f, engine, parser): result = df.query(q, engine=engine, parser=parser) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_check_tz_aware_index_query(self, tz_aware_fixture): # https://github.com/pandas-dev/pandas/issues/29463 tz = tz_aware_fixture @@ -760,6 +757,7 @@ def test_check_tz_aware_index_query(self, tz_aware_fixture): tm.assert_frame_equal(result, expected) expected = DataFrame(df_index) + expected.columns = 
expected.columns.astype(object) result = df.reset_index().query('"2018-01-03 00:00:00+00" < time') tm.assert_frame_equal(result, expected) @@ -1057,7 +1055,7 @@ def test_query_with_string_columns(self, parser, engine): with pytest.raises(NotImplementedError, match=msg): df.query("a in b and c < d", parser=parser, engine=engine) - def test_object_array_eq_ne(self, parser, engine, using_infer_string): + def test_object_array_eq_ne(self, parser, engine): df = DataFrame( { "a": list("aaaabbbbcccc"), @@ -1066,14 +1064,11 @@ def test_object_array_eq_ne(self, parser, engine, using_infer_string): "d": np.random.default_rng(2).integers(9, size=12), } ) - warning = RuntimeWarning if using_infer_string and engine == "numexpr" else None - with tm.assert_produces_warning(warning): - res = df.query("a == b", parser=parser, engine=engine) + res = df.query("a == b", parser=parser, engine=engine) exp = df[df.a == df.b] tm.assert_frame_equal(res, exp) - with tm.assert_produces_warning(warning): - res = df.query("a != b", parser=parser, engine=engine) + res = df.query("a != b", parser=parser, engine=engine) exp = df[df.a != df.b] tm.assert_frame_equal(res, exp) @@ -1112,16 +1107,12 @@ def test_query_with_nested_special_character(self, parser, engine): [">=", operator.ge], ], ) - def test_query_lex_compare_strings( - self, parser, engine, op, func, using_infer_string - ): + def test_query_lex_compare_strings(self, parser, engine, op, func): a = Series(np.random.default_rng(2).choice(list("abcde"), 20)) b = Series(np.arange(a.size)) df = DataFrame({"X": a, "Y": b}) - warning = RuntimeWarning if using_infer_string and engine == "numexpr" else None - with tm.assert_produces_warning(warning): - res = df.query(f'X {op} "d"', engine=engine, parser=parser) + res = df.query(f'X {op} "d"', engine=engine, parser=parser) expected = df[func(df.X, "d")] tm.assert_frame_equal(res, expected) From 2789338a45d4959946856eedb33de36a206e3e2d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 
25 Sep 2024 19:57:49 +0200 Subject: [PATCH 30/34] String dtype: map builtin str alias to StringDtype (#59685) * String dtype: map builtin str alias to StringDtype * fix tests * fix datetimelike astype and more tests * remove xfails * try fix typing * fix copy_view tests * fix remaining tests with infer_string enabled * ignore typing issue for now * move to common.py * simplify Categorical._str_get_dummies * small cleanup * fix ensure_string_array to not modify extension arrays inplace * fix ensure_string_array once more + fix is_extension_array_dtype for str * still xfail TestArrowArray::test_astype_str when not using infer_string * ensure maybe_convert_objects copies object dtype input array when inferring StringDtype * update test_1d_object_array_does_not_copy test * update constructor copy test + do not copy in maybe_convert_objects? * skip str.get_dummies test for now * use pandas_dtype() instead of registry.find * fix corner cases for calling pandas_dtype * add TODO comment in ensure_string_array --- pandas/_libs/lib.pyx | 9 +++- pandas/_testing/__init__.py | 2 +- pandas/core/arrays/categorical.py | 4 +- pandas/core/arrays/datetimelike.py | 10 ++++- pandas/core/dtypes/common.py | 18 +++++++- pandas/core/indexes/base.py | 6 ++- pandas/core/indexes/interval.py | 3 +- pandas/tests/arrays/floating/test_astype.py | 6 +-- pandas/tests/arrays/integer/test_dtypes.py | 6 +-- pandas/tests/arrays/sparse/test_astype.py | 4 +- pandas/tests/arrays/sparse/test_dtype.py | 2 +- pandas/tests/dtypes/test_common.py | 12 ++++++ pandas/tests/extension/base/casting.py | 4 +- pandas/tests/extension/json/array.py | 3 +- pandas/tests/extension/test_arrow.py | 29 +++---------- pandas/tests/frame/methods/test_astype.py | 17 ++++---- .../tests/frame/methods/test_select_dtypes.py | 5 ++- pandas/tests/frame/test_constructors.py | 41 +++++++++++++++---- .../indexes/datetimes/methods/test_astype.py | 15 ++++--- pandas/tests/indexes/object/test_astype.py | 4 +- 
.../indexes/period/methods/test_astype.py | 9 +++- .../indexes/timedeltas/methods/test_astype.py | 9 +++- pandas/tests/interchange/test_impl.py | 1 + pandas/tests/io/excel/test_readers.py | 8 ++-- .../io/parser/dtypes/test_dtypes_basic.py | 17 ++++---- pandas/tests/io/parser/test_na_values.py | 2 - .../io/parser/test_python_parser_only.py | 6 +-- pandas/tests/series/methods/test_astype.py | 30 ++++++++------ pandas/tests/series/methods/test_map.py | 4 +- pandas/tests/series/test_constructors.py | 2 +- pandas/tests/test_algos.py | 7 +++- 31 files changed, 183 insertions(+), 112 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index d93099cd79d1b..c23f907aecfab 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -755,7 +755,14 @@ cpdef ndarray[object] ensure_string_array( if hasattr(arr, "to_numpy"): - if hasattr(arr, "dtype") and arr.dtype.kind in "mM": + if ( + hasattr(arr, "dtype") + and arr.dtype.kind in "mM" + # TODO: we should add a custom ArrowExtensionArray.astype implementation + # that handles astype(str) specifically, avoiding ending up here and + # then we can remove the below check for `_pa_array` (for ArrowEA) + and not hasattr(arr, "_pa_array") + ): # dtype check to exclude DataFrame # GH#41409 TODO: not a great place for this out = arr.astype(str).astype(object) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 10c1c490551fb..3aa7c64831efe 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -112,7 +112,7 @@ COMPLEX_DTYPES: list[Dtype] = [complex, "complex64", "complex128"] if using_string_dtype(): - STRING_DTYPES: list[Dtype] = [str, "U"] + STRING_DTYPES: list[Dtype] = ["U"] else: STRING_DTYPES: list[Dtype] = [str, "str", "U"] # type: ignore[no-redef] COMPLEX_FLOAT_DTYPES: list[Dtype] = [*COMPLEX_DTYPES, *FLOAT_NUMPY_DTYPES] diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6ffc0df243130..97004474648b2 100644 --- 
a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2691,7 +2691,9 @@ def _str_get_dummies(self, sep: str = "|"): # sep may not be in categories. Just bail on this. from pandas.core.arrays import NumpyExtensionArray - return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep) + return NumpyExtensionArray(self.to_numpy(str, na_value="NaN"))._str_get_dummies( + sep + ) # ------------------------------------------------------------------------ # GroupBy Methods diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index e85c0222bbec3..81e2f04f2ba2e 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -472,10 +472,16 @@ def astype(self, dtype, copy: bool = True): return self._box_values(self.asi8.ravel()).reshape(self.shape) + elif is_string_dtype(dtype): + if isinstance(dtype, ExtensionDtype): + arr_object = self._format_native_types(na_rep=dtype.na_value) # type: ignore[arg-type] + cls = dtype.construct_array_type() + return cls._from_sequence(arr_object, dtype=dtype, copy=False) + else: + return self._format_native_types() + elif isinstance(dtype, ExtensionDtype): return super().astype(dtype, copy=copy) - elif is_string_dtype(dtype): - return self._format_native_types() elif dtype.kind in "iu": # we deliberately ignore int32 vs. int64 here. # See https://github.com/pandas-dev/pandas/issues/24381 for more. diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index df0251d141984..fe705daaad5fa 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -12,6 +12,8 @@ import numpy as np +from pandas._config import using_string_dtype + from pandas._libs import ( Interval, Period, @@ -1325,7 +1327,15 @@ def is_extension_array_dtype(arr_or_dtype) -> bool: elif isinstance(dtype, np.dtype): return False else: - return registry.find(dtype) is not None + try: + with warnings.catch_warnings(): + # pandas_dtype(..) 
can raise UserWarning for class input + warnings.simplefilter("ignore", UserWarning) + dtype = pandas_dtype(dtype) + except (TypeError, ValueError): + # np.dtype(..) can raise ValueError + return False + return isinstance(dtype, ExtensionDtype) def is_ea_or_datetimelike_dtype(dtype: DtypeObj | None) -> bool: @@ -1620,6 +1630,12 @@ def pandas_dtype(dtype) -> DtypeObj: elif isinstance(dtype, (np.dtype, ExtensionDtype)): return dtype + # builtin aliases + if dtype is str and using_string_dtype(): + from pandas.core.arrays.string_ import StringDtype + + return StringDtype(na_value=np.nan) + # registered extension types result = registry.find(dtype) if result is not None: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a28c98ecc5cee..8e8eb768130fd 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6415,7 +6415,11 @@ def _should_compare(self, other: Index) -> bool: return False dtype = _unpack_nested_dtype(other) - return self._is_comparable_dtype(dtype) or is_object_dtype(dtype) + return ( + self._is_comparable_dtype(dtype) + or is_object_dtype(dtype) + or is_string_dtype(dtype) + ) def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: """ diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 4fcdb87974511..635924674d9f4 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -50,6 +50,7 @@ is_number, is_object_dtype, is_scalar, + is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import ( @@ -699,7 +700,7 @@ def _get_indexer( # left/right get_indexer, compare elementwise, equality -> match indexer = self._get_indexer_unique_sides(target) - elif not is_object_dtype(target.dtype): + elif not (is_object_dtype(target.dtype) or is_string_dtype(target.dtype)): # homogeneous scalar index: use IntervalTree # we should always have self._should_partial_index(target) here target = self._maybe_convert_i8(target) diff --git 
a/pandas/tests/arrays/floating/test_astype.py b/pandas/tests/arrays/floating/test_astype.py index ccf644b34051d..752ebe194ffcf 100644 --- a/pandas/tests/arrays/floating/test_astype.py +++ b/pandas/tests/arrays/floating/test_astype.py @@ -68,11 +68,9 @@ def test_astype_str(using_infer_string): if using_infer_string: expected = pd.array(["0.1", "0.2", None], dtype=pd.StringDtype(na_value=np.nan)) - tm.assert_extension_array_equal(a.astype("str"), expected) - # TODO(infer_string) this should also be a string array like above - expected = np.array(["0.1", "0.2", ""], dtype="U32") - tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_extension_array_equal(a.astype(str), expected) + tm.assert_extension_array_equal(a.astype("str"), expected) else: expected = np.array(["0.1", "0.2", ""], dtype="U32") diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index 7be00e569b3fe..90879d8bd3063 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -283,11 +283,9 @@ def test_astype_str(using_infer_string): if using_infer_string: expected = pd.array(["1", "2", None], dtype=pd.StringDtype(na_value=np.nan)) - tm.assert_extension_array_equal(a.astype("str"), expected) - # TODO(infer_string) this should also be a string array like above - expected = np.array(["1", "2", ""], dtype=f"{tm.ENDIAN}U21") - tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_extension_array_equal(a.astype(str), expected) + tm.assert_extension_array_equal(a.astype("str"), expected) else: expected = np.array(["1", "2", ""], dtype=f"{tm.ENDIAN}U21") diff --git a/pandas/tests/arrays/sparse/test_astype.py b/pandas/tests/arrays/sparse/test_astype.py index 83a507e679d46..e6e4a11a0f5ab 100644 --- a/pandas/tests/arrays/sparse/test_astype.py +++ b/pandas/tests/arrays/sparse/test_astype.py @@ -81,8 +81,8 @@ def test_astype_all(self, any_real_numpy_dtype): ), ( SparseArray([0, 1, 10]), - str, - 
SparseArray(["0", "1", "10"], dtype=SparseDtype(str, "0")), + np.str_, + SparseArray(["0", "1", "10"], dtype=SparseDtype(np.str_, "0")), ), (SparseArray(["10", "20"]), float, SparseArray([10.0, 20.0])), ( diff --git a/pandas/tests/arrays/sparse/test_dtype.py b/pandas/tests/arrays/sparse/test_dtype.py index 234f4092421e5..149c28341ba3d 100644 --- a/pandas/tests/arrays/sparse/test_dtype.py +++ b/pandas/tests/arrays/sparse/test_dtype.py @@ -177,7 +177,7 @@ def test_construct_from_string_fill_value_raises(string): [ (SparseDtype(int, 0), float, SparseDtype(float, 0.0)), (SparseDtype(int, 1), float, SparseDtype(float, 1.0)), - (SparseDtype(int, 1), str, SparseDtype(object, "1")), + (SparseDtype(int, 1), np.str_, SparseDtype(object, "1")), (SparseDtype(float, 1.5), int, SparseDtype(int, 1)), ], ) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index ccd30caba5dee..f7442cf5d6d3c 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -810,11 +810,23 @@ def test_pandas_dtype_string_dtypes(string_storage): "pyarrow" if HAS_PYARROW else "python", na_value=np.nan ) + with pd.option_context("future.infer_string", True): + # with the default string_storage setting + result = pandas_dtype(str) + assert result == pd.StringDtype( + "pyarrow" if HAS_PYARROW else "python", na_value=np.nan + ) + with pd.option_context("future.infer_string", True): with pd.option_context("string_storage", string_storage): result = pandas_dtype("str") assert result == pd.StringDtype(string_storage, na_value=np.nan) + with pd.option_context("future.infer_string", True): + with pd.option_context("string_storage", string_storage): + result = pandas_dtype(str) + assert result == pd.StringDtype(string_storage, na_value=np.nan) + with pd.option_context("future.infer_string", False): with pd.option_context("string_storage", string_storage): result = pandas_dtype("str") diff --git a/pandas/tests/extension/base/casting.py 
b/pandas/tests/extension/base/casting.py index 2bfe801c48a77..56879129c3a28 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -43,8 +43,8 @@ def test_tolist(self, data): assert result == expected def test_astype_str(self, data): - result = pd.Series(data[:5]).astype(str) - expected = pd.Series([str(x) for x in data[:5]], dtype=str) + result = pd.Series(data[:2]).astype(str) + expected = pd.Series([str(x) for x in data[:2]], dtype=str) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index e43b50322bb92..5cbd45a99ae5c 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -207,9 +207,8 @@ def astype(self, dtype, copy=True): return self.copy() return self elif isinstance(dtype, StringDtype): - value = self.astype(str) # numpy doesn't like nested dicts arr_cls = dtype.construct_array_type() - return arr_cls._from_sequence(value, dtype=dtype, copy=False) + return arr_cls._from_sequence(self, dtype=dtype, copy=False) elif not copy: return np.asarray([dict(x) for x in self], dtype=dtype) else: diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index d0ec87905aa87..60e7bd83432c5 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -41,7 +41,6 @@ pa_version_under13p0, pa_version_under14p0, ) -import pandas.util._test_decorators as td from pandas.core.dtypes.dtypes import ( ArrowDtype, @@ -286,7 +285,7 @@ def test_map(self, data_missing, na_action): expected = data_missing.to_numpy() tm.assert_numpy_array_equal(result, expected) - def test_astype_str(self, data, request): + def test_astype_str(self, data, request, using_infer_string): pa_dtype = data.dtype.pyarrow_dtype if pa.types.is_binary(pa_dtype): request.applymarker( @@ -294,9 +293,10 @@ def test_astype_str(self, data, request): reason=f"For 
{pa_dtype} .astype(str) decodes.", ) ) - elif ( - pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None - ) or pa.types.is_duration(pa_dtype): + elif not using_infer_string and ( + (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None) + or pa.types.is_duration(pa_dtype) + ): request.applymarker( pytest.mark.xfail( reason="pd.Timestamp/pd.Timedelta repr different from numpy repr", @@ -304,25 +304,6 @@ def test_astype_str(self, data, request): ) super().test_astype_str(data) - @pytest.mark.parametrize( - "nullable_string_dtype", - [ - "string[python]", - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - ], - ) - def test_astype_string(self, data, nullable_string_dtype, request): - pa_dtype = data.dtype.pyarrow_dtype - if ( - pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None - ) or pa.types.is_duration(pa_dtype): - request.applymarker( - pytest.mark.xfail( - reason="pd.Timestamp/pd.Timedelta repr different from numpy repr", - ) - ) - super().test_astype_string(data, nullable_string_dtype) - def test_from_dtype(self, data, request): pa_dtype = data.dtype.pyarrow_dtype if pa.types.is_string(pa_dtype) or pa.types.is_decimal(pa_dtype): diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 9c27e76de91b2..ca3764ac87e95 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -169,21 +169,21 @@ def test_astype_str(self): "d": list(map(str, d._values)), "e": list(map(str, e._values)), }, - dtype="object", + dtype="str", ) tm.assert_frame_equal(result, expected) - def test_astype_str_float(self): + def test_astype_str_float(self, using_infer_string): # see GH#11302 result = DataFrame([np.nan]).astype(str) - expected = DataFrame(["nan"], dtype="object") + expected = DataFrame([np.nan if using_infer_string else "nan"], dtype="str") tm.assert_frame_equal(result, expected) result = DataFrame([1.12345678901234567890]).astype(str) val = 
"1.1234567890123457" - expected = DataFrame([val], dtype="object") + expected = DataFrame([val], dtype="str") tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype_class", [dict, Series]) @@ -285,7 +285,7 @@ def test_astype_duplicate_col_series_arg(self): result = df.astype(dtypes) expected = DataFrame( { - 0: Series(vals[:, 0].astype(str), dtype=object), + 0: Series(vals[:, 0].astype(str), dtype="str"), 1: vals[:, 1], 2: pd.array(vals[:, 2], dtype="Float64"), 3: vals[:, 3], @@ -666,9 +666,10 @@ def test_astype_dt64tz(self, timezone_frame): # dt64tz->dt64 deprecated timezone_frame.astype("datetime64[ns]") - def test_astype_dt64tz_to_str(self, timezone_frame): + def test_astype_dt64tz_to_str(self, timezone_frame, using_infer_string): # str formatting result = timezone_frame.astype(str) + na_value = np.nan if using_infer_string else "NaT" expected = DataFrame( [ [ @@ -676,7 +677,7 @@ def test_astype_dt64tz_to_str(self, timezone_frame): "2013-01-01 00:00:00-05:00", "2013-01-01 00:00:00+01:00", ], - ["2013-01-02", "NaT", "NaT"], + ["2013-01-02", na_value, na_value], [ "2013-01-03", "2013-01-03 00:00:00-05:00", @@ -684,7 +685,7 @@ def test_astype_dt64tz_to_str(self, timezone_frame): ], ], columns=timezone_frame.columns, - dtype="object", + dtype="str", ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index 875dca321635f..0354e9df3d168 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -99,6 +99,9 @@ def test_select_dtypes_include_using_list_like(self, using_infer_string): ei = df[["a"]] tm.assert_frame_equal(ri, ei) + ri = df.select_dtypes(include=[str]) + tm.assert_frame_equal(ri, ei) + def test_select_dtypes_exclude_using_list_like(self): df = DataFrame( { @@ -358,7 +361,7 @@ def test_select_dtypes_datetime_with_tz(self): @pytest.mark.parametrize("dtype", [str, "str", np.bytes_, 
"S1", np.str_, "U1"]) @pytest.mark.parametrize("arg", ["include", "exclude"]) def test_select_dtypes_str_raises(self, dtype, arg, using_infer_string): - if using_infer_string and dtype == "str": + if using_infer_string and (dtype == "str" or dtype is str): # this is tested below pytest.skip("Selecting string columns works with future strings") df = DataFrame( diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index f70d36d110625..fd770b368c9da 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -24,7 +24,6 @@ from pandas._config import using_string_dtype from pandas._libs import lib -from pandas.compat import HAS_PYARROW from pandas.compat.numpy import np_version_gt2 from pandas.errors import IntCastingNaNError import pandas.util._test_decorators as td @@ -83,7 +82,7 @@ def test_constructor_from_ndarray_with_str_dtype(self): # with an array of strings each of which is e.g. "[0 1 2]" arr = np.arange(12).reshape(4, 3) df = DataFrame(arr, dtype=str) - expected = DataFrame(arr.astype(str), dtype=object) + expected = DataFrame(arr.astype(str), dtype="str") tm.assert_frame_equal(df, expected) def test_constructor_from_2d_datetimearray(self, using_array_manager): @@ -328,19 +327,39 @@ def test_constructor_dtype_nocast_view_2d_array( assert df2._mgr.arrays[0].flags.c_contiguous @td.skip_array_manager_invalid_test - @pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="conversion copies") - def test_1d_object_array_does_not_copy(self): + def test_1d_object_array_does_not_copy(self, using_infer_string): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array(["a", "b"], dtype="object") df = DataFrame(arr, copy=False) + if using_infer_string: + if df[0].dtype.storage == "pyarrow": + # object dtype strings are converted to arrow memory, + # no numpy arrays to compare + pass + else: + assert np.shares_memory(df[0].to_numpy(), arr) + else: + assert 
np.shares_memory(df.values, arr) + + df = DataFrame(arr, dtype=object, copy=False) assert np.shares_memory(df.values, arr) @td.skip_array_manager_invalid_test - @pytest.mark.xfail(using_string_dtype(), reason="conversion copies") - def test_2d_object_array_does_not_copy(self): + def test_2d_object_array_does_not_copy(self, using_infer_string): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array([["a", "b"], ["c", "d"]], dtype="object") df = DataFrame(arr, copy=False) + if using_infer_string: + if df[0].dtype.storage == "pyarrow": + # object dtype strings are converted to arrow memory, + # no numpy arrays to compare + pass + else: + assert np.shares_memory(df[0].to_numpy(), arr) + else: + assert np.shares_memory(df.values, arr) + + df = DataFrame(arr, dtype=object, copy=False) assert np.shares_memory(df.values, arr) def test_constructor_dtype_list_data(self): @@ -1793,12 +1812,18 @@ def test_constructor_column_duplicates(self): tm.assert_frame_equal(idf, edf) - def test_constructor_empty_with_string_dtype(self): + def test_constructor_empty_with_string_dtype(self, using_infer_string): # GH 9428 expected = DataFrame(index=[0, 1], columns=[0, 1], dtype=object) + expected_str = DataFrame( + index=[0, 1], columns=[0, 1], dtype=pd.StringDtype(na_value=np.nan) + ) df = DataFrame(index=[0, 1], columns=[0, 1], dtype=str) - tm.assert_frame_equal(df, expected) + if using_infer_string: + tm.assert_frame_equal(df, expected_str) + else: + tm.assert_frame_equal(df, expected) df = DataFrame(index=[0, 1], columns=[0, 1], dtype=np.str_) tm.assert_frame_equal(df, expected) df = DataFrame(index=[0, 1], columns=[0, 1], dtype="U5") diff --git a/pandas/tests/indexes/datetimes/methods/test_astype.py b/pandas/tests/indexes/datetimes/methods/test_astype.py index c0bc6601769b1..a9bcae625e494 100644 --- a/pandas/tests/indexes/datetimes/methods/test_astype.py +++ b/pandas/tests/indexes/datetimes/methods/test_astype.py @@ -102,13 +102,16 @@ def 
test_astype_tznaive_to_tzaware(self): # dt64->dt64tz deprecated idx._data.astype("datetime64[ns, US/Eastern]") - def test_astype_str_nat(self): + def test_astype_str_nat(self, using_infer_string): # GH 13149, GH 13209 # verify that we are returning NaT as a string (and not unicode) idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.nan]) result = idx.astype(str) - expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype=object) + if using_infer_string: + expected = Index(["2016-05-16", None, None, None], dtype="str") + else: + expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype=object) tm.assert_index_equal(result, expected) def test_astype_str(self): @@ -118,7 +121,7 @@ def test_astype_str(self): expected = Index( ["2012-01-01", "2012-01-02", "2012-01-03", "2012-01-04"], name="test_name", - dtype=object, + dtype="str", ) tm.assert_index_equal(result, expected) @@ -133,7 +136,7 @@ def test_astype_str_tz_and_name(self): "2012-01-03 00:00:00-05:00", ], name="test_name", - dtype=object, + dtype="str", ) tm.assert_index_equal(result, expected) @@ -144,7 +147,7 @@ def test_astype_str_freq_and_name(self): expected = Index( ["2011-01-01 00:00:00", "2011-01-01 01:00:00", "2011-01-01 02:00:00"], name="test_name", - dtype=object, + dtype="str", ) tm.assert_index_equal(result, expected) @@ -156,7 +159,7 @@ def test_astype_str_freq_and_tz(self): result = dti.astype(str) expected = Index( ["2012-03-06 00:00:00+00:00", "2012-03-06 01:00:00+00:00"], - dtype=object, + dtype="str", name="test_name", ) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/object/test_astype.py b/pandas/tests/indexes/object/test_astype.py index 9c1ef302c5b51..ce05b5e9f2238 100644 --- a/pandas/tests/indexes/object/test_astype.py +++ b/pandas/tests/indexes/object/test_astype.py @@ -15,12 +15,12 @@ def test_astype_str_from_bytes(): # ensure_string_array which does f"{val}" idx = Index(["あ", b"a"], dtype="object") result = idx.astype(str) - expected = Index(["あ", "a"], 
dtype="object") + expected = Index(["あ", "a"], dtype="str") tm.assert_index_equal(result, expected) # while we're here, check that Series.astype behaves the same result = Series(idx).astype(str) - expected = Series(expected, dtype=object) + expected = Series(expected, dtype="str") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/period/methods/test_astype.py b/pandas/tests/indexes/period/methods/test_astype.py index d545bfd2fae0f..af3c2667f51b4 100644 --- a/pandas/tests/indexes/period/methods/test_astype.py +++ b/pandas/tests/indexes/period/methods/test_astype.py @@ -22,7 +22,7 @@ def test_astype_raises(self, dtype): with pytest.raises(TypeError, match=msg): idx.astype(dtype) - def test_astype_conversion(self): + def test_astype_conversion(self, using_infer_string): # GH#13149, GH#13209 idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.nan], freq="D", name="idx") @@ -41,7 +41,12 @@ def test_astype_conversion(self): tm.assert_index_equal(result, expected) result = idx.astype(str) - expected = Index([str(x) for x in idx], name="idx", dtype=object) + if using_infer_string: + expected = Index( + [str(x) if x is not NaT else None for x in idx], name="idx", dtype="str" + ) + else: + expected = Index([str(x) for x in idx], name="idx", dtype=object) tm.assert_index_equal(result, expected) idx = period_range("1990", "2009", freq="Y", name="idx") diff --git a/pandas/tests/indexes/timedeltas/methods/test_astype.py b/pandas/tests/indexes/timedeltas/methods/test_astype.py index 311f2b5c9aa59..5166cadae499e 100644 --- a/pandas/tests/indexes/timedeltas/methods/test_astype.py +++ b/pandas/tests/indexes/timedeltas/methods/test_astype.py @@ -44,7 +44,7 @@ def test_astype_object_with_nat(self): tm.assert_index_equal(result, expected) assert idx.tolist() == expected_list - def test_astype(self): + def test_astype(self, using_infer_string): # GH 13149, GH 13209 idx = TimedeltaIndex([1e14, "NaT", NaT, np.nan], name="idx") @@ -61,7 +61,12 @@ def test_astype(self): 
tm.assert_index_equal(result, expected) result = idx.astype(str) - expected = Index([str(x) for x in idx], name="idx", dtype=object) + if using_infer_string: + expected = Index( + [str(x) if x is not NaT else None for x in idx], name="idx", dtype="str" + ) + else: + expected = Index([str(x) for x in idx], name="idx", dtype=object) tm.assert_index_equal(result, expected) rng = timedelta_range("1 days", periods=10) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index b3af8def191ec..ef94c4c7aff2c 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -412,6 +412,7 @@ def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None: pd.api.interchange.from_dataframe(df) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_empty_string_column(): # https://github.com/pandas-dev/pandas/issues/56703 df = pd.DataFrame({"a": []}, dtype=str) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 8dc76d8f747cb..3c5e1e1cf5afb 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -550,7 +550,7 @@ def test_reader_dtype(self, read_ext): expected["a"] = expected["a"].astype("float64") expected["b"] = expected["b"].astype("float32") - expected["c"] = Series(["001", "002", "003", "004"], dtype=object) + expected["c"] = Series(["001", "002", "003", "004"], dtype="str") tm.assert_frame_equal(actual, expected) msg = "Unable to convert column d to type int64" @@ -577,9 +577,9 @@ def test_reader_dtype(self, read_ext): { "a": Series([1, 2, 3, 4], dtype="float64"), "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"), - "c": Series(["001", "002", "003", "004"], dtype=object), - "d": Series(["1", "2", np.nan, "4"], dtype=object), - } + "c": Series(["001", "002", "003", "004"], dtype="str"), + "d": Series(["1", "2", np.nan, "4"], dtype="str"), + }, ), ), ], diff --git 
a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index bc7b21baaeec5..787941c5d0376 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -28,7 +28,7 @@ @pytest.mark.parametrize("dtype", [str, object]) @pytest.mark.parametrize("check_orig", [True, False]) @pytest.mark.usefixtures("pyarrow_xfail") -def test_dtype_all_columns(all_parsers, dtype, check_orig): +def test_dtype_all_columns(all_parsers, dtype, check_orig, using_infer_string): # see gh-3795, gh-6607 parser = all_parsers @@ -46,8 +46,10 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig): if check_orig: expected = df.copy() result = result.astype(float) - else: + elif using_infer_string and dtype is str: expected = df.astype(str) + else: + expected = df.astype(str).astype(object) tm.assert_frame_equal(result, expected) @@ -300,7 +302,6 @@ def test_true_values_cast_to_bool(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.usefixtures("pyarrow_xfail") @pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)]) def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value): @@ -316,7 +317,6 @@ def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.usefixtures("pyarrow_xfail") def test_dtype_mangle_dup_cols_single_dtype(all_parsers): # GH#42022 @@ -565,7 +565,7 @@ def test_string_inference(all_parsers): @pytest.mark.parametrize("dtype", ["O", object, "object", np.object_, str, np.str_]) -def test_string_inference_object_dtype(all_parsers, dtype): +def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string): # GH#56047 data = """a,b x,a @@ -575,10 +575,11 @@ def test_string_inference_object_dtype(all_parsers, dtype): 
with pd.option_context("future.infer_string", True): result = parser.read_csv(StringIO(data), dtype=dtype) + expected_dtype = pd.StringDtype(na_value=np.nan) if dtype is str else object expected = DataFrame( { - "a": pd.Series(["x", "y", "z"], dtype=object), - "b": pd.Series(["a", "a", "a"], dtype=object), + "a": pd.Series(["x", "y", "z"], dtype=expected_dtype), + "b": pd.Series(["a", "a", "a"], dtype=expected_dtype), }, columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), ) @@ -589,7 +590,7 @@ def test_string_inference_object_dtype(all_parsers, dtype): expected = DataFrame( { - "a": pd.Series(["x", "y", "z"], dtype=object), + "a": pd.Series(["x", "y", "z"], dtype=expected_dtype), "b": pd.Series(["a", "a", "a"], dtype=pd.StringDtype(na_value=np.nan)), }, columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 1a3b7b37bf66b..5f9823f7225f9 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -630,7 +630,6 @@ def test_inf_na_values_with_int_index(all_parsers): tm.assert_frame_equal(out, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @xfail_pyarrow # mismatched shape @pytest.mark.parametrize("na_filter", [True, False]) def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): @@ -682,7 +681,6 @@ def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values): # TODO: this test isn't about the na_values keyword, it is about the empty entries # being returned with NaN entries, whereas the pyarrow engine returns "nan" @xfail_pyarrow # mismatched shapes -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_str_nan_dropped(all_parsers): # see gh-21131 parser = all_parsers diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 
9e7530906afa3..5f2ddf7de9c6d 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -17,8 +17,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ( ParserError, ParserWarning, @@ -498,7 +496,6 @@ def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parse tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype", [{"a": object}, {"a": str, "b": np.int64, "c": np.int64}] ) @@ -523,10 +520,11 @@ def test_no_thousand_convert_with_dot_for_non_numeric_cols(python_parser_only, d "c": [0, 4000, 131], } ) + if dtype["a"] == object: + expected["a"] = expected["a"].astype(object) tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype,expected", [ diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index ef0757ffe4aa8..b9ba03d1e9f41 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -76,7 +76,7 @@ def test_astype_dict_like(self, dtype_class): dt1 = dtype_class({"abc": str}) result = ser.astype(dt1) - expected = Series(["0", "2", "4", "6", "8"], name="abc", dtype=object) + expected = Series(["0", "2", "4", "6", "8"], name="abc", dtype="str") tm.assert_series_equal(result, expected) dt2 = dtype_class({"abc": "float64"}) @@ -172,10 +172,14 @@ def test_astype_empty_constructor_equality(self, dtype): ) def test_astype_str_map(self, dtype, series, using_infer_string): # see GH#4405 + using_string_dtype = using_infer_string and dtype is str result = series.astype(dtype) - expected = series.map(str) - if using_infer_string: - expected = expected.astype(object) + if using_string_dtype: + expected = series.map(lambda val: str(val) if val is not np.nan else 
np.nan) + else: + expected = series.map(str) + if using_infer_string: + expected = expected.astype(object) tm.assert_series_equal(result, expected) def test_astype_float_to_period(self): @@ -212,7 +216,7 @@ def test_astype_dt64_to_str(self): # GH#10442 : testing astype(str) is correct for Series/DatetimeIndex dti = date_range("2012-01-01", periods=3) result = Series(dti).astype(str) - expected = Series(["2012-01-01", "2012-01-02", "2012-01-03"], dtype=object) + expected = Series(["2012-01-01", "2012-01-02", "2012-01-03"], dtype="str") tm.assert_series_equal(result, expected) def test_astype_dt64tz_to_str(self): @@ -225,7 +229,7 @@ def test_astype_dt64tz_to_str(self): "2012-01-02 00:00:00-05:00", "2012-01-03 00:00:00-05:00", ], - dtype=object, + dtype="str", ) tm.assert_series_equal(result, expected) @@ -285,13 +289,13 @@ def test_astype_str_cast_dt64(self): ts = Series([Timestamp("2010-01-04 00:00:00")]) res = ts.astype(str) - expected = Series(["2010-01-04"], dtype=object) + expected = Series(["2010-01-04"], dtype="str") tm.assert_series_equal(res, expected) ts = Series([Timestamp("2010-01-04 00:00:00", tz="US/Eastern")]) res = ts.astype(str) - expected = Series(["2010-01-04 00:00:00-05:00"], dtype=object) + expected = Series(["2010-01-04 00:00:00-05:00"], dtype="str") tm.assert_series_equal(res, expected) def test_astype_str_cast_td64(self): @@ -300,7 +304,7 @@ def test_astype_str_cast_td64(self): td = Series([Timedelta(1, unit="d")]) ser = td.astype(str) - expected = Series(["1 days"], dtype=object) + expected = Series(["1 days"], dtype="str") tm.assert_series_equal(ser, expected) def test_dt64_series_astype_object(self): @@ -347,7 +351,7 @@ def test_astype_from_float_to_str(self, dtype): # https://github.com/pandas-dev/pandas/issues/36451 ser = Series([0.1], dtype=dtype) result = ser.astype(str) - expected = Series(["0.1"], dtype=object) + expected = Series(["0.1"], dtype="str") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -358,11 
+362,13 @@ def test_astype_from_float_to_str(self, dtype): (NA, ""), ], ) - def test_astype_to_str_preserves_na(self, value, string_value): + def test_astype_to_str_preserves_na(self, value, string_value, using_infer_string): # https://github.com/pandas-dev/pandas/issues/36904 ser = Series(["a", "b", value], dtype=object) result = ser.astype(str) - expected = Series(["a", "b", string_value], dtype=object) + expected = Series( + ["a", "b", None if using_infer_string else string_value], dtype="str" + ) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("dtype", ["float32", "float64", "int64", "int32"]) diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index ac489b2579e05..e5281a18236da 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -553,13 +553,11 @@ def f(x): (list(range(3)), {0: 42}, [42] + [np.nan] * 3), ], ) -def test_map_missing_mixed(vals, mapping, exp, using_infer_string): +def test_map_missing_mixed(vals, mapping, exp): # GH20495 s = Series(vals + [np.nan]) result = s.map(mapping) exp = Series(exp) - if using_infer_string and mapping == {np.nan: "not NaN"}: - exp.iloc[-1] = np.nan tm.assert_series_equal(result, exp) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 6efe0bcb8b45d..60b2ec7b6912d 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -230,7 +230,7 @@ def test_constructor_empty(self, input_class, using_infer_string): # GH 19853 : with empty string, index and dtype str empty = Series("", dtype=str, index=range(3)) if using_infer_string: - empty2 = Series("", index=range(3), dtype=object) + empty2 = Series("", index=range(3), dtype="str") else: empty2 = Series("", index=range(3)) tm.assert_series_equal(empty, empty2) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index f3a7ba2607f4a..a7c2ec5acb7c2 100644 --- 
a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1900,13 +1900,16 @@ def test_strobj_mode(self): tm.assert_series_equal(ser.mode(), exp) @pytest.mark.parametrize("dt", [str, object]) - def test_strobj_multi_char(self, dt): + def test_strobj_multi_char(self, dt, using_infer_string): exp = ["bar"] data = ["foo"] * 2 + ["bar"] * 3 ser = Series(data, dtype=dt) exp = Series(exp, dtype=dt) - tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) + if using_infer_string and dt is str: + tm.assert_extension_array_equal(algos.mode(ser.values), exp.values) + else: + tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) tm.assert_series_equal(ser.mode(), exp) def test_datelike_mode(self): From 53ac22481523dd4106a6f198f137262acdeb23f8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 25 Sep 2024 21:16:04 +0200 Subject: [PATCH 31/34] String dtype: allow string dtype for non-raw apply with numba engine (#59854) * String dtype: allow string dtype for non-raw apply with numba engine * remove xfails * clean-up --- pandas/core/_numba/extensions.py | 3 ++- pandas/core/apply.py | 5 ----- pandas/tests/apply/test_frame_apply.py | 1 - pandas/tests/apply/test_numba.py | 4 ---- 4 files changed, 2 insertions(+), 11 deletions(-) diff --git a/pandas/core/_numba/extensions.py b/pandas/core/_numba/extensions.py index ee09c9380fb0f..b05f12295a729 100644 --- a/pandas/core/_numba/extensions.py +++ b/pandas/core/_numba/extensions.py @@ -49,7 +49,8 @@ @contextmanager def set_numba_data(index: Index): numba_data = index._data - if numba_data.dtype == object: + if numba_data.dtype in (object, "string"): + numba_data = np.asarray(numba_data) if not lib.is_string_array(numba_data): raise ValueError( "The numba engine only supports using string or numeric column names" diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 25a71ce5b5f4f..fafc9ee1b6928 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1174,12 +1174,7 @@ def 
apply_with_numba(self) -> dict[int, Any]: from pandas.core._numba.extensions import set_numba_data index = self.obj.index - if index.dtype == "string": - index = index.astype(object) - columns = self.obj.columns - if columns.dtype == "string": - columns = columns.astype(object) # Convert from numba dict to regular dict # Our isinstance checks in the df constructor don't pass for numbas typed dict diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 5e0f991d5c406..6a328dfb39be5 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -65,7 +65,6 @@ def test_apply(float_frame, engine, request): assert result.index is float_frame.index -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("raw", [True, False]) def test_apply_args(float_frame, axis, raw, engine, request): diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index 83b655f89e247..20c067a776f4d 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td import pandas as pd @@ -20,7 +18,6 @@ def apply_axis(request): return request.param -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_numba_vs_python_noop(float_frame, apply_axis): func = lambda x: x result = float_frame.apply(func, engine="numba", axis=apply_axis) @@ -43,7 +40,6 @@ def test_numba_vs_python_string_index(): ) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_numba_vs_python_indexing(): frame = DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7.0, 8.0, 9.0]}, From ed78032dc1552d40aabf0bcb1fa74cf8c3b03063 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 10 Oct 2024 12:53:16 +0200 Subject: [PATCH 
32/34] fixup rank test --- pandas/tests/series/methods/test_rank.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index f0fe1d989941e..1c3ebe5653ce3 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -261,7 +261,11 @@ def test_rank_signature(self): def test_rank_tie_methods(self, ser, results, dtype, using_infer_string): method, exp = results - if dtype == "int64" or (not using_infer_string and dtype == "str"): + if ( + dtype == "int64" + or dtype == "Int64" + or (not using_infer_string and dtype == "str") + ): pytest.skip("int64/str does not support NaN") ser = ser if dtype is None else ser.astype(dtype) From 581582bcdef30ec91429381a835dfbb29bdc9ac8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 10 Oct 2024 13:24:21 +0200 Subject: [PATCH 33/34] update tests --- pandas/tests/arithmetic/test_object.py | 4 ++++ pandas/tests/groupby/methods/test_describe.py | 6 +++++- pandas/tests/groupby/test_numeric_only.py | 1 + pandas/tests/groupby/transform/test_transform.py | 4 ++-- pandas/tests/indexing/test_iloc.py | 4 +++- pandas/tests/indexing/test_loc.py | 4 ---- pandas/tests/io/test_feather.py | 3 --- pandas/tests/reshape/test_melt.py | 5 ++++- 8 files changed, 19 insertions(+), 12 deletions(-) diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index bc0f78d3aa01a..44e485d40ba53 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -183,6 +183,10 @@ def test_objarr_add_invalid(self, op, box_with_array): "unsupported operand type", "must be str", "has no kernel", + "operation 'add' not supported", + "operation 'radd' not supported", + "operation 'sub' not supported", + "operation 'rsub' not supported", ] ) with pytest.raises(Exception, match=msg): diff --git a/pandas/tests/groupby/methods/test_describe.py 
b/pandas/tests/groupby/methods/test_describe.py index c80063e673b81..c0889ab415e74 100644 --- a/pandas/tests/groupby/methods/test_describe.py +++ b/pandas/tests/groupby/methods/test_describe.py @@ -71,7 +71,7 @@ def test_series_describe_as_index(as_index, keys): tm.assert_frame_equal(result, expected) -def test_frame_describe_multikey(tsframe): +def test_frame_describe_multikey(tsframe, using_infer_string): grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) result = grouped.describe() desc_groups = [] @@ -87,6 +87,10 @@ def test_frame_describe_multikey(tsframe): expected = pd.concat(desc_groups, axis=1) tm.assert_frame_equal(result, expected) + # remainder of the tests fails with string dtype but is testing deprecated behaviour + if using_infer_string: + return + msg = "DataFrame.groupby with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py index b1fa541d42086..3b7614347d181 100644 --- a/pandas/tests/groupby/test_numeric_only.py +++ b/pandas/tests/groupby/test_numeric_only.py @@ -273,6 +273,7 @@ def test_axis1_numeric_only(request, groupby_func, numeric_only, using_infer_str # cumsum, diff, pct_change "unsupported operand type", "has no kernel", + "operation 'sub' not supported for dtype 'str' with dtype 'float64'", ) if using_infer_string: pa = pytest.importorskip("pyarrow") diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 5823656a610e5..395036dd400e5 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -513,7 +513,7 @@ def test_transform_nuisance_raises(df, using_infer_string): msg = "Could not convert" if using_infer_string: if df.columns.dtype.storage == "pyarrow": - msg = "with dtype str does not support operation 
'mean'" + msg = "with dtype str does not support reduction 'mean'" else: msg = "Cannot perform reduction 'mean' with string dtype" with pytest.raises(TypeError, match=msg): @@ -621,7 +621,7 @@ def test_groupby_transform_with_int(using_infer_string): msg = "Could not convert" if using_infer_string: if HAS_PYARROW: - msg = "with dtype str does not support operation 'mean'" + msg = "with dtype str does not support reduction 'mean'" else: msg = "Cannot perform reduction 'mean' with string dtype" with np.errstate(all="ignore"): diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 45f63bdf1ee32..c2742f42e3a92 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -1221,7 +1221,9 @@ def test_iloc_setitem_multicolumn_to_datetime(self, using_infer_string): df = DataFrame({"A": ["2022-01-01", "2022-01-02"], "B": ["2021", "2022"]}) if using_infer_string: - with pytest.raises(TypeError, match="Invalid value"): + with tm.assert_produces_warning( + FutureWarning, match="Setting an item of incompatible dtype" + ): df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])}) else: df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])}) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index ad72be02f81b1..bdc6d9aff6f4e 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -16,7 +16,6 @@ from pandas._config import using_string_dtype from pandas._libs import index as libindex -from pandas.compat import HAS_PYARROW from pandas.compat.numpy import np_version_gt2 from pandas.errors import IndexingError import pandas.util._test_decorators as td @@ -1459,9 +1458,6 @@ def test_loc_setitem_listlike_with_timedelta64index(self, indexer, expected): tm.assert_frame_equal(expected, df) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" - ) def 
test_loc_setitem_categorical_values_partial_column_slice(self): # Assigning a Category to parts of a int/... column uses the values of # the Categorical diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 24fc801de44a7..3b4484e44e155 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -2,8 +2,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd import pandas._testing as tm @@ -148,7 +146,6 @@ def test_path_localpath(self): result = tm.round_trip_localpath(df.to_feather, read_feather) tm.assert_frame_equal(df, result) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_passthrough_keywords(self): df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index e58187ba6bcbc..72fd72df60761 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -1199,7 +1199,10 @@ def test_raise_of_column_name_value(self): ): df.melt(id_vars="value", value_name="value") - def test_missing_stubname(self, any_string_dtype): + def test_missing_stubname(self, request, any_string_dtype, using_infer_string): + if using_infer_string and any_string_dtype == "object": + # triggers object dtype inference warning of dtype=object + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) # GH46044 df = DataFrame({"id": ["1", "2"], "a-1": [100, 200], "a-2": [300, 400]}) df = df.astype({"id": any_string_dtype}) From 1d1e3da825a3c9703b6b818cc1debcab3dbeade5 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 10 Oct 2024 13:29:11 +0200 Subject: [PATCH 34/34] fix linting --- pandas/core/dtypes/dtypes.py | 2 +- pandas/tests/extension/test_string.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index c6ca24d19b906..e7efb8598ec61 100644 
--- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -1791,7 +1791,7 @@ def _is_na_fill_value(self) -> bool: @property def _is_numeric(self) -> bool: - return not self.subtype == object + return self.subtype != object @property def _is_boolean(self) -> bool: diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 354b4d5333c7d..07c3b4224e76f 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -212,9 +212,6 @@ def test_compare_scalar(self, data, comparison_op): ser = pd.Series(data) self._compare_other(ser, data, comparison_op, "abc") - def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): - super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op) - def test_combine_add(self, data_repeated, using_infer_string, request): dtype = next(data_repeated(1)).dtype if using_infer_string and (