Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -2276,6 +2276,13 @@ cdef _array_like_to_pandas(obj, options, types_mapper):
dtype = original_type.to_pandas_dtype()
except NotImplementedError:
pass
elif pandas_api.uses_string_dtype() and not options["strings_to_categorical"] and (
original_type.id == _Type_STRING or
original_type.id == _Type_LARGE_STRING or
original_type.id == _Type_STRING_VIEW
):
# for pandas 3.0+, use pandas' new default string dtype
dtype = pandas_api.pd.StringDtype(na_value=np.nan)

# Only call __from_arrow__ for Arrow extension types or when explicitly
# overridden via types_mapper
Expand Down
48 changes: 45 additions & 3 deletions python/pyarrow/tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -2975,7 +2975,9 @@ def check_zero_copy_failure(self, arr):
arr.to_pandas(zero_copy_only=True)

def test_zero_copy_failure_on_object_types(self):
self.check_zero_copy_failure(pa.array(['A', 'B', 'C']))
if Version(pd.__version__) < Version("3.0.0"):
# pandas 3.0 includes default string dtype support
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure I understand why this test has to have this guard now. Isn't it supposed to work with pandas > 3.0.0?
I suppose this is because we are testing object types specifically. Was this test failing on CI? I haven't seen the failure.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be connected to the change I made in this PR as strings are not converted to pandas object anymore. But looking at the test it might be a leftover from my previous wrong approach. Thanks for the comment, I need to check this!

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, got it. This test checks that strings cannot be zero-copied to pandas, which has been true in the past because the C++ machinery constructed an object-dtype array from the PyArrow string type. Now, with pandas 3.0.0, the conversion goes through __from_arrow__, where no copies are needed.

Running this test locally with pandas 3.0.0 gives the following error:
______________________________________________ TestZeroCopyConversion.test_zero_copy_failure_on_object_types _______________________________________________

self = <pyarrow.tests.test_pandas.TestZeroCopyConversion object at 0x156a0af90>

    def test_zero_copy_failure_on_object_types(self):
>       self.check_zero_copy_failure(pa.array(['A', 'B', 'C']))

python/pyarrow/tests/test_pandas.py:2978: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <pyarrow.tests.test_pandas.TestZeroCopyConversion object at 0x156a0af90>
arr = <pyarrow.lib.StringArray object at 0x15699b700>
[
  "A",
  "B",
  "C"
]

    def check_zero_copy_failure(self, arr):
>       with pytest.raises(pa.ArrowInvalid):
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E       Failed: DID NOT RAISE <class 'pyarrow.lib.ArrowInvalid'>

python/pyarrow/tests/test_pandas.py:2974: Failed

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, from what I can see this is an expected change, since string conversion will now actually be zero copy

(although, strictly speaking, it is not actually zero-copy entirely, because the test here is using string, and pandas will convert that to large_string. But I suppose that happens outside the view of pyarrow)

Copy link
Copy Markdown
Member

@jorisvandenbossche jorisvandenbossche Apr 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Essentially, the zero_copy_only keyword is ignored whenever the conversion goes through dtype.__from_arrow__ (the same goes for other options), so it is not even about whether a copy is made in pandas 3.0, just about using an ExtensionDtype.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh yes, I see. Should this be changed when dealing with Extension types? I know we have a list of things to work on when it comes to this topic and we can open up an umbrella issue with all possible improvements.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure how to easily improve this .. (since we defer to pandas for the conversion, and that method we call does not have those keywords)

(long term, I would like to see this logic moved entirely to pandas)

self.check_zero_copy_failure(pa.array(['A', 'B', 'C']))

# An integer array containing a null is passed to check_zero_copy_failure,
# which expects to_pandas(zero_copy_only=True) to raise pa.ArrowInvalid.
def test_zero_copy_failure_with_int_when_nulls(self):
self.check_zero_copy_failure(pa.array([0, 1, None]))
Expand Down Expand Up @@ -3047,6 +3049,10 @@ def test_all_none_category(self):

def test_empty_arrays(self):
for dtype_str, pa_type in self.type_pairs:
if (Version(pd.__version__) >= Version("3.0.0") and
pa_type == pa.string()):
# The PyArrow-backed string dtype is used by default
dtype_str = 'str'
Comment thread
jorisvandenbossche marked this conversation as resolved.
arr = np.array([], dtype=np.dtype(dtype_str))
_check_array_roundtrip(arr, type=pa_type)

Expand Down Expand Up @@ -3231,13 +3237,19 @@ def test_convert_empty_table(self):
empty_objects = pd.Series(np.array([], dtype=object))
tm.assert_series_equal(arr.to_pandas(),
pd.Series(np.array([], dtype=np.int64)))
arr = pa.array([], type=pa.string())
tm.assert_series_equal(arr.to_pandas(), empty_objects)
arr = pa.array([], type=pa.list_(pa.int64()))
tm.assert_series_equal(arr.to_pandas(), empty_objects)
arr = pa.array([], type=pa.struct([pa.field('a', pa.int64())]))
tm.assert_series_equal(arr.to_pandas(), empty_objects)

arr = pa.array([], type=pa.string())
if Version(pd.__version__) >= Version("3.0.0"):
# The PyArrow-backed string dtype is used by default
empty_str = pd.Series([], dtype=str)
tm.assert_series_equal(arr.to_pandas(), empty_str)
else:
tm.assert_series_equal(arr.to_pandas(), empty_objects)

def test_non_natural_stride(self):
"""
ARROW-2172: converting from a Numpy array with a stride that's
Expand Down Expand Up @@ -4652,6 +4664,36 @@ def test_chunked_array_to_pandas_types_mapper():
assert result.dtype == np.dtype("int64")


@pytest.mark.parametrize(
    "string_type", [pa.string(), pa.large_string(), pa.string_view()]
)
@pytest.mark.parametrize("data", [[], [None]])
def test_array_to_pandas_string_dtype(string_type, data):
    """Check the pandas dtype produced by ``to_pandas`` for string arrays.

    Exercises empty and all-null data for each parametrized Arrow string
    type, through both ``pa.array`` and ``pa.chunked_array``, and verifies
    how ``types_mapper`` and ``strings_to_categorical`` affect the result.
    Skipped when pandas is older than 3.0.0.
    """
    # GH-49002
    if Version(pd.__version__) < Version("3.0.0"):
        pytest.skip("PyArrow backed string dtype missing")

    # Built only after the version guard above.
    string_dtype = pd.StringDtype(na_value=np.nan)

    flat = pa.array(data, type=string_type)
    assert flat.to_pandas().dtype == string_dtype

    chunked = pa.chunked_array([data], type=string_type)
    assert chunked.to_pandas().dtype == string_dtype

    # Test types_mapper takes precedence: a mapper returning None for the
    # string type yields plain object dtype instead of the string dtype.
    mapper = {string_type: None}.get
    mapped = chunked.to_pandas(types_mapper=mapper)
    assert mapped.dtype == np.dtype("object")

    # Test strings_to_categorical: False keeps the string dtype ...
    as_strings = chunked.to_pandas(strings_to_categorical=False)
    assert as_strings.dtype == string_dtype
    # ... while True produces a categorical; the parametrized data holds no
    # non-null values, so the categories are empty.
    as_categorical = chunked.to_pandas(strings_to_categorical=True)
    assert as_categorical.dtype == pd.CategoricalDtype(
        categories=[], ordered=False
    )


# ----------------------------------------------------------------------
# Legacy metadata compatibility tests

Expand Down
Loading