Skip to content

Commit

Permalink
apacheGH-41978: [Python] Fix pandas tests to follow downstream datetime64 unit changes
Browse files Browse the repository at this point in the history
  • Loading branch information
jorisvandenbossche committed Jun 5, 2024
1 parent 24054ef commit 68aa04d
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 7 deletions.
11 changes: 8 additions & 3 deletions python/pyarrow/pandas_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,9 +108,10 @@ def get_logical_type_from_numpy(pandas_collection):
except KeyError:
if hasattr(pandas_collection.dtype, 'tz'):
return 'datetimetz'
# See https://github.com/pandas-dev/pandas/issues/24739
if str(pandas_collection.dtype) == 'datetime64[ns]':
return 'datetime64[ns]'
# See https://github.com/pandas-dev/pandas/issues/24739 (infer_dtype will
# result in "datetime64" without unit, while pandas astype requires a unit)
if str(pandas_collection.dtype).startswith('datetime64'):
return str(pandas_collection.dtype)
result = _pandas_api.infer_dtype(pandas_collection)
if result == 'string':
return 'unicode'
Expand Down Expand Up @@ -1105,6 +1106,10 @@ def _reconstruct_columns_from_metadata(columns, column_indexes):
tz = pa.lib.string_to_tzinfo(
column_indexes[0]['metadata']['timezone'])
level = pd.to_datetime(level, utc=True).tz_convert(tz)
if _pandas_api.is_ge_v3():
# with pandas 3+, to_datetime returns a unit depending on the string
# data, so we restore it to the original unit from the metadata
level = level.as_unit(np.datetime_data(dtype)[0])
elif level.dtype != dtype:
level = level.astype(dtype)
# ARROW-9096: if original DataFrame was upcast we keep that
Expand Down
6 changes: 4 additions & 2 deletions python/pyarrow/tests/interchange/test_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,8 +335,10 @@ def test_pandas_to_pyarrow_with_missing(np_float):
np_array = np.array([0, np.nan, 2], dtype=np_float)
datetime_array = [None, dt(2007, 7, 14), dt(2007, 7, 15)]
df = pd.DataFrame({
"a": np_array, # float, ColumnNullType.USE_NAN
"dt": datetime_array # ColumnNullType.USE_SENTINEL
# float, ColumnNullType.USE_NAN
"a": np_array,
# ColumnNullType.USE_SENTINEL
"dt": np.array(datetime_array, dtype="datetime64[ns]")
})
expected = pa.table({
"a": pa.array(np_array, from_pandas=True),
Expand Down
1 change: 1 addition & 0 deletions python/pyarrow/tests/parquet/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,7 @@ def get_table(pq_reader_method, filename, **kwargs):
pq_reader_method, filename, coerce_int96_timestamp_unit="s"
)
df_correct = tab_correct.to_pandas(timestamp_as_object=True)
df["a"] = df["a"].astype(object)
tm.assert_frame_equal(df, df_correct)


Expand Down
6 changes: 4 additions & 2 deletions python/pyarrow/tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -4730,21 +4730,23 @@ def make_df_with_timestamps():
# Some of the milliseconds timestamps deliberately don't fit in the range
# that is possible with nanosecond timestamps.
df = pd.DataFrame({
'dateTimeMs': [
'dateTimeMs': np.array([
np.datetime64('0001-01-01 00:00', 'ms'),
np.datetime64('2012-05-02 12:35', 'ms'),
np.datetime64('2012-05-03 15:42', 'ms'),
np.datetime64('3000-05-03 15:42', 'ms'),
],
], dtype=object),
'dateTimeNs': [
np.datetime64('1991-01-01 00:00', 'ns'),
np.datetime64('2012-05-02 12:35', 'ns'),
np.datetime64('2012-05-03 15:42', 'ns'),
np.datetime64('2050-05-03 15:42', 'ns'),
],
})
df['dateTimeMs'] = df['dateTimeMs'].astype('object')
# Not part of what we're testing, just ensuring that the inputs are what we
# expect.
# if Version(pd.__version__) < Version("3.0.0.dev0"):
assert (df.dateTimeMs.dtype, df.dateTimeNs.dtype) == (
# O == object, M8[ns] == timestamp64[ns]
np.dtype("O"), np.dtype("M8[ns]")
Expand Down

0 comments on commit 68aa04d

Please sign in to comment.