From dcf9a3a9669634281e55548856801908be8e90fa Mon Sep 17 00:00:00 2001 From: "Ankit.Ahlawat@ibm.com" Date: Tue, 28 Apr 2026 10:59:04 +0530 Subject: [PATCH 1/4] GH-49875: [Python] Fix timezone dropped when converting tz-aware Categorical to Arrow array --- python/pyarrow/array.pxi | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index b7f3a46f9e14..ecdbb342d3e2 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -356,8 +356,8 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, values.codes, mask, index_type, memory_pool) try: dictionary = array( - values.categories.values, type=value_type, - memory_pool=memory_pool) + values.categories, type=value_type, + from_pandas=True, memory_pool=memory_pool) except TypeError: # TODO when removing the deprecation warning, this whole # try/except can be removed (to bubble the TypeError of @@ -371,7 +371,8 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, "TypeError", FutureWarning, stacklevel=2) dictionary = array( - values.categories.values, memory_pool=memory_pool) + values.categories, from_pandas=True, + memory_pool=memory_pool) else: raise From fb275d0aec3bfc31dcfd12f1a10344d04d477310 Mon Sep 17 00:00:00 2001 From: "Ankit.Ahlawat@ibm.com" Date: Mon, 4 May 2026 17:32:11 +0530 Subject: [PATCH 2/4] GH-49875: [Python] Add test for tz-aware Categorical timezone preservation --- python/pyarrow/tests/test_pandas.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 0339975f4571..89bedf42b34e 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -492,6 +492,22 @@ def test_categorical_row_index(self): _check_pandas_roundtrip(df, preserve_index=True) + def test_categorical_with_timezone(self): + # pandas Categorical with timezone-aware datetime categories + 
# GH-49875: timezone was dropped when converting tz-aware categorical + cats = pd.DatetimeIndex(["2024-01-01", "2024-01-02"]).tz_localize("US/Eastern") + cat = pd.Categorical(values=[cats[0], cats[1], cats[0]], categories=cats) + + # Verify pandas keeps the timezone on categories + assert str(cat.dtype.categories.dtype) == "datetime64[us, US/Eastern]" + + # Convert to PyArrow + arr = pa.array(cat, from_pandas=True) + + # Verify timezone is preserved in the dictionary value type + assert arr.type.value_type.tz is not None + assert str(arr.type.value_type.tz) == "US/Eastern" + def test_duplicate_column_names_does_not_crash(self): df = pd.DataFrame([(1, 'a'), (2, 'b')], columns=list('aa')) with pytest.raises(ValueError): From aa65de7e8d6c410584f625ea9cf07d8d8e140564 Mon Sep 17 00:00:00 2001 From: "Ankit.Ahlawat@ibm.com" Date: Mon, 4 May 2026 19:13:41 +0530 Subject: [PATCH 3/4] GH-49875: [Python] Fix test to handle both ns and us datetime resolution --- python/pyarrow/tests/test_pandas.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 89bedf42b34e..bd7b6599a068 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -497,13 +497,13 @@ def test_categorical_with_timezone(self): # GH-49875: timezone was dropped when converting tz-aware categorical cats = pd.DatetimeIndex(["2024-01-01", "2024-01-02"]).tz_localize("US/Eastern") cat = pd.Categorical(values=[cats[0], cats[1], cats[0]], categories=cats) - + # Verify pandas keeps the timezone on categories - assert str(cat.dtype.categories.dtype) == "datetime64[us, US/Eastern]" - + assert "US/Eastern" in str(cat.dtype.categories.dtype) + # Convert to PyArrow arr = pa.array(cat, from_pandas=True) - + # Verify timezone is preserved in the dictionary value type assert arr.type.value_type.tz is not None assert str(arr.type.value_type.tz) == "US/Eastern" From 
0f1f595173e3c3ecf53c3f482aee3e0c77970a83 Mon Sep 17 00:00:00 2001 From: "Ankit.Ahlawat@ibm.com" Date: Mon, 4 May 2026 21:03:39 +0530 Subject: [PATCH 4/4] GH-49875: [Python] Update the test case location as per the review comments --- python/pyarrow/tests/test_pandas.py | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index bd7b6599a068..063532140c6e 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -492,22 +492,6 @@ def test_categorical_row_index(self): _check_pandas_roundtrip(df, preserve_index=True) - def test_categorical_with_timezone(self): - # pandas Categorical with timezone-aware datetime categories - # GH-49875: timezone was dropped when converting tz-aware categorical - cats = pd.DatetimeIndex(["2024-01-01", "2024-01-02"]).tz_localize("US/Eastern") - cat = pd.Categorical(values=[cats[0], cats[1], cats[0]], categories=cats) - - # Verify pandas keeps the timezone on categories - assert "US/Eastern" in str(cat.dtype.categories.dtype) - - # Convert to PyArrow - arr = pa.array(cat, from_pandas=True) - - # Verify timezone is preserved in the dictionary value type - assert arr.type.value_type.tz is not None - assert str(arr.type.value_type.tz) == "US/Eastern" - def test_duplicate_column_names_does_not_crash(self): df = pd.DataFrame([(1, 'a'), (2, 'b')], columns=list('aa')) with pytest.raises(ValueError): @@ -3063,6 +3047,15 @@ def test_all_none_category(self): df['a'] = df['a'].astype('category') _check_pandas_roundtrip(df) + def test_categorical_with_timezone(self): + # GH-49875: timezone was dropped when converting tz-aware categorical + cats = pd.DatetimeIndex(["2024-01-01", "2024-01-02"]).tz_localize("US/Eastern") + cat = pd.Categorical(values=[cats[0], cats[1], cats[0]], categories=cats) + + arr = pa.array(cat, from_pandas=True) + + assert arr.type.value_type.tz == "US/Eastern" + def 
test_empty_arrays(self): for dtype_str, pa_type in self.type_pairs: if (Version(pd.__version__) >= Version("3.0.0") and