Skip to content

Commit 0980e1c

Browse files
authored
Merge pull request #271 from lincc-frameworks/nested-df-dtypes
Propagate inner nested columns to elements
2 parents 03c3ad4 + df9b5a9 commit 0980e1c

File tree

3 files changed

+82
-21
lines changed

3 files changed

+82
-21
lines changed

src/nested_pandas/series/ext_array.py

Lines changed: 46 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -176,9 +176,28 @@ def replace_with_mask(array: pa.ChunkedArray, mask: pa.BooleanArray, value: pa.A
176176
return pa.compute.if_else(mask, broadcast_value, array)
177177

178178

179-
def convert_df_to_pa_scalar(df: pd.DataFrame, *, pa_type: pa.DataType | None) -> pa.Scalar:
180-
d = {column: series.values for column, series in df.to_dict("series").items()}
181-
return pa.scalar(d, type=pa_type, from_pandas=True)
179+
def convert_df_to_pa_scalar(df: pd.DataFrame, *, pa_type: pa.StructType | None) -> pa.Scalar:
180+
d = {}
181+
types = {}
182+
columns = df.columns
183+
if pa_type is not None:
184+
names = pa_type.names
185+
columns = names + list(set(columns) - set(names))
186+
for column in columns:
187+
series = df[column]
188+
if isinstance(series.dtype, NestedDtype):
189+
scalar = series.array.to_pyarrow_scalar(list_struct=True)
190+
ty = scalar.type
191+
else:
192+
array = pa.array(series)
193+
ty = pa.list_(array.type)
194+
scalar = pa.scalar(array, type=ty)
195+
d[column] = scalar
196+
types[column] = ty
197+
result = pa.scalar(d, type=pa.struct(types), from_pandas=True)
198+
if pa_type is not None:
199+
result = result.cast(pa_type)
200+
return result
182201

183202

184203
class NestedExtensionArray(ExtensionArray):
@@ -686,17 +705,37 @@ def _from_arrow_like(cls, arraylike, dtype: NestedDtype | None = None) -> Self:
686705
raise ValueError(f"Cannot cast input to {dtype}") from None
687706
return cls(cast_array)
688707

689-
@classmethod
690-
def _convert_struct_scalar_to_df(cls, value: pa.StructScalar, *, copy: bool, na_value: Any = None) -> Any:
708+
def _convert_struct_scalar_to_df(
709+
self, value: pa.StructScalar, *, copy: bool, na_value: Any = None, pyarrow_dtypes: bool = False
710+
) -> Any:
691711
"""Converts a struct scalar of equal-length list scalars to a pd.DataFrame
692712
693713
No validation is done, so the input must be a struct scalar with all fields being list scalars
694714
of the same lengths.
715+
716+
Parameters
717+
----------
718+
value : pa.StructScalar
719+
The struct scalar to convert.
720+
copy : bool
721+
Whether to copy the data.
722+
na_value : Any, optional
723+
The value to use for nulls.
724+
pyarrow_dtypes : bool, optional
725+
Whether to use pd.ArrowDtype. Nested fields will always
726+
have NestedDtype.
695727
"""
696728
if pa.compute.is_null(value).as_py():
697729
return na_value
698-
d = {name: pd.Series(list_scalar.values, copy=copy) for name, list_scalar in value.items()}
699-
return pd.DataFrame(d, copy=False)
730+
series = {}
731+
for name, list_scalar in value.items():
732+
dtype = self.dtype.field_dtype(name)
733+
# field_dtype() returns pd.ArrowDtype for non-NestedDtype fields;
734+
# reset it to None when pandas "ordinary" dtypes are requested, so pandas infers the dtype.
735+
if not pyarrow_dtypes and not isinstance(dtype, NestedDtype):
736+
dtype = None
737+
series[name] = pd.Series(list_scalar.values, dtype=dtype, copy=copy, name=name)
738+
return pd.DataFrame(series, copy=False)
700739

701740
@property
702741
def _list_storage(self):

tests/nested_pandas/series/test_ext_array.py

Lines changed: 23 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
import pyarrow.compute as pc
77
import pytest
88
from nested_pandas import NestedDtype
9+
from nested_pandas.datasets import generate_data
10+
from nested_pandas.nestedframe.core import NestedFrame
911
from nested_pandas.series.ext_array import NestedExtensionArray, convert_df_to_pa_scalar, replace_with_mask
1012
from numpy.testing import assert_array_equal
1113
from pandas.core.arrays import ArrowExtensionArray
@@ -297,6 +299,17 @@ def test_convert_df_to_pa_from_scalar():
297299
)
298300

299301

302+
def test_convert_df_to_pa_scalar_from_pyarrow_dtyped_df():
303+
"""Test that we can convert a frame with pd.ArrowDtype series to pyarrow struct_scalar."""
304+
df = pd.DataFrame({"a": pd.Series([1, 2, 3], dtype=pd.ArrowDtype(pa.int32())), "b": [-4.0, -5.0, -6.0]})
305+
pa_scalar = convert_df_to_pa_scalar(df, pa_type=None)
306+
307+
assert pa_scalar == pa.scalar(
308+
{"a": [1, 2, 3], "b": [-4.0, -5.0, -6.0]},
309+
type=pa.struct([pa.field("a", pa.list_(pa.int32())), pa.field("b", pa.list_(pa.float64()))]),
310+
)
311+
312+
300313
def test__box_pa_array_from_series_of_df():
301314
"""Test that we can convert a DataFrame to a pyarrow scalar."""
302315
series = pd.Series(
@@ -692,20 +705,17 @@ def test_list_offsets_multiple_chunks():
692705

693706

694707
def test___getitem___with_integer():
695-
"""Test [i] is a valid DataFrame."""
696-
struct_array = pa.StructArray.from_arrays(
697-
arrays=[
698-
pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])]),
699-
pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])]),
700-
],
701-
names=["a", "b"],
702-
)
703-
ext_array = NestedExtensionArray(struct_array)
708+
"""Test [i] is a valid DataFrame with NestedDtype propagated."""
709+
nf = generate_data(10, 3)
710+
# repeat index 3 and nest on it
711+
nf["id"] = [0, 1, 2, 3, 3, 4, 5, 6, 7, 8]
712+
nnf = NestedFrame.from_flat(nf, base_columns=[], on="id", name="outer")
713+
ext_array = nnf["outer"].array
704714

705-
second_row_as_df = ext_array[1]
706-
assert_frame_equal(
707-
second_row_as_df, pd.DataFrame({"a": np.array([1.0, 2.0, 1.0]), "b": -np.array([3.0, 4.0, 5.0])})
708-
)
715+
actual = ext_array[3]
716+
desired = pd.DataFrame(nf.query("id == 3").drop("id", axis=1)).reset_index(drop=True)
717+
718+
assert_frame_equal(actual, desired)
709719

710720

711721
def test___getitem___with_integer_ndarray():

tests/nested_pandas/series/test_packer.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
import pandas as pd
33
import pyarrow as pa
44
import pytest
5-
from nested_pandas import NestedDtype
5+
from nested_pandas import NestedDtype, NestedFrame
6+
from nested_pandas.datasets import generate_data
67
from nested_pandas.series import packer
78
from numpy.testing import assert_array_equal
89
from pandas.testing import assert_frame_equal, assert_series_equal
@@ -441,6 +442,17 @@ def test_pack_seq_with_series_of_dfs():
441442
assert_series_equal(series, desired)
442443

443444

445+
def test_pack_seq_with_double_nested():
446+
"""Test pack_seq works correctly for frames with nested columns."""
447+
nf = generate_data(10, 3)
448+
nf["id"] = [0, 0, 1, 2, 2, 3, 4, 4, 5, 5]
449+
desired = NestedFrame.from_flat(nf, base_columns=[], on="id", name="outer")["outer"].reset_index(
450+
drop=True,
451+
)
452+
actual = packer.pack_seq(list(desired))
453+
assert_frame_equal(actual.nest.to_flat(), desired.nest.to_flat())
454+
455+
444456
def test_view_sorted_df_as_list_arrays():
445457
"""Test view_sorted_df_as_list_arrays()."""
446458
flat_df = pd.DataFrame(

0 commit comments

Comments
 (0)