Merge pull request #271 from lincc-frameworks/nested-df-dtypes

hombit · web-flow · commit 0980e1c25dc7 · 2025-05-20T16:34:05.000-04:00
Propagate inner nested columns to elements
diff --git a/src/nested_pandas/series/ext_array.py b/src/nested_pandas/series/ext_array.py
@@ -176,9 +176,28 @@ def replace_with_mask(array: pa.ChunkedArray, mask: pa.BooleanArray, value: pa.A
     return pa.compute.if_else(mask, broadcast_value, array)
 
 
-def convert_df_to_pa_scalar(df: pd.DataFrame, *, pa_type: pa.DataType | None) -> pa.Scalar:
-    d = {column: series.values for column, series in df.to_dict("series").items()}
-    return pa.scalar(d, type=pa_type, from_pandas=True)
+def convert_df_to_pa_scalar(df: pd.DataFrame, *, pa_type: pa.StructType | None) -> pa.Scalar:
+    """Build a pyarrow struct scalar with one field per df column; cast it to pa_type if given."""
+    d = {}
+    types = {}
+    columns = df.columns
+    if pa_type is not None:
+        names = pa_type.names
+        # Append extra columns in frame order; a set difference would order them arbitrarily.
+        columns = names + [column for column in columns if column not in names]
+    for column in columns:
+        series = df[column]
+        if isinstance(series.dtype, NestedDtype):
+            scalar = series.array.to_pyarrow_scalar(list_struct=True)
+        else:
+            array = pa.array(series)
+            scalar = pa.scalar(array, type=pa.list_(array.type))
+        d[column] = scalar
+        types[column] = scalar.type
+    result = pa.scalar(d, type=pa.struct(types), from_pandas=True)
+    if pa_type is not None:
+        result = result.cast(pa_type)
+    return result
 
 
 class NestedExtensionArray(ExtensionArray):
@@ -686,17 +705,37 @@ def _from_arrow_like(cls, arraylike, dtype: NestedDtype | None = None) -> Self:
                 raise ValueError(f"Cannot cast input to {dtype}") from None
         return cls(cast_array)
 
-    @classmethod
-    def _convert_struct_scalar_to_df(cls, value: pa.StructScalar, *, copy: bool, na_value: Any = None) -> Any:
+    def _convert_struct_scalar_to_df(
+        self, value: pa.StructScalar, *, copy: bool, na_value: Any = None, pyarrow_dtypes: bool = False
+    ) -> Any:
         """Converts a struct scalar of equal-length list scalars to a pd.DataFrame
 
         No validation is done, so the input must be a struct scalar with all fields being list scalars
         of the same lengths.
+
+        Parameters
+        ----------
+        value : pa.StructScalar
+            The struct scalar to convert.
+        copy : bool
+            Whether to copy the data.
+        na_value : Any, optional
+            The value to use for nulls.
+        pyarrow_dtypes : bool, optional
+            Whether to use pd.ArrowDtype. Nested fields will always
+            have NestedDtype.
         """
         if pa.compute.is_null(value).as_py():
             return na_value
-        d = {name: pd.Series(list_scalar.values, copy=copy) for name, list_scalar in value.items()}
-        return pd.DataFrame(d, copy=False)
+        series = {}
+        for name, list_scalar in value.items():
+            dtype = self.dtype.field_dtype(name)
+            # field_dtype() gives pd.ArrowDtype for non-NestedDtype fields;
+            # reset it to None when pandas "ordinary" dtypes are wanted instead.
+            if not pyarrow_dtypes and not isinstance(dtype, NestedDtype):
+                dtype = None
+            series[name] = pd.Series(list_scalar.values, dtype=dtype, copy=copy, name=name)
+        return pd.DataFrame(series, copy=False)
 
     @property
     def _list_storage(self):
diff --git a/tests/nested_pandas/series/test_ext_array.py b/tests/nested_pandas/series/test_ext_array.py
@@ -6,6 +6,8 @@
 import pyarrow.compute as pc
 import pytest
 from nested_pandas import NestedDtype
+from nested_pandas.datasets import generate_data
+from nested_pandas.nestedframe.core import NestedFrame
 from nested_pandas.series.ext_array import NestedExtensionArray, convert_df_to_pa_scalar, replace_with_mask
 from numpy.testing import assert_array_equal
 from pandas.core.arrays import ArrowExtensionArray
@@ -297,6 +299,17 @@ def test_convert_df_to_pa_from_scalar():
     )
 
 
+def test_convert_df_to_pa_scalar_from_pyarrow_dtyped_df():
+    """Test that we can convert a frame with pd.ArrowDtype series to pyarrow struct_scalar."""
+    df = pd.DataFrame({"a": pd.Series([1, 2, 3], dtype=pd.ArrowDtype(pa.int32())), "b": [-4.0, -5.0, -6.0]})
+    pa_scalar = convert_df_to_pa_scalar(df, pa_type=None)
+
+    assert pa_scalar == pa.scalar(
+        {"a": [1, 2, 3], "b": [-4.0, -5.0, -6.0]},
+        type=pa.struct([pa.field("a", pa.list_(pa.int32())), pa.field("b", pa.list_(pa.float64()))]),
+    )
+
+
 def test__box_pa_array_from_series_of_df():
     """Test that we can convert a DataFrame to a pyarrow scalar."""
     series = pd.Series(
@@ -692,20 +705,17 @@ def test_list_offsets_multiple_chunks():
 
 
 def test___getitem___with_integer():
-    """Test [i] is a valid DataFrame."""
-    struct_array = pa.StructArray.from_arrays(
-        arrays=[
-            pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])]),
-            pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])]),
-        ],
-        names=["a", "b"],
-    )
-    ext_array = NestedExtensionArray(struct_array)
+    """Test [i] is a valid DataFrame with NestedDtype propagated."""
+    nf = generate_data(10, 3)
+    # repeat index 3 and nest on it
+    nf["id"] = [0, 1, 2, 3, 3, 4, 5, 6, 7, 8]
+    nnf = NestedFrame.from_flat(nf, base_columns=[], on="id", name="outer")
+    ext_array = nnf["outer"].array
 
-    second_row_as_df = ext_array[1]
-    assert_frame_equal(
-        second_row_as_df, pd.DataFrame({"a": np.array([1.0, 2.0, 1.0]), "b": -np.array([3.0, 4.0, 5.0])})
-    )
+    actual = ext_array[3]
+    desired = pd.DataFrame(nf.query("id == 3").drop("id", axis=1)).reset_index(drop=True)
+
+    assert_frame_equal(actual, desired)
 
 
 def test___getitem___with_integer_ndarray():
diff --git a/tests/nested_pandas/series/test_packer.py b/tests/nested_pandas/series/test_packer.py
@@ -2,7 +2,8 @@
 import pandas as pd
 import pyarrow as pa
 import pytest
-from nested_pandas import NestedDtype
+from nested_pandas import NestedDtype, NestedFrame
+from nested_pandas.datasets import generate_data
 from nested_pandas.series import packer
 from numpy.testing import assert_array_equal
 from pandas.testing import assert_frame_equal, assert_series_equal
@@ -441,6 +442,17 @@ def test_pack_seq_with_series_of_dfs():
     assert_series_equal(series, desired)
 
 
+def test_pack_seq_with_double_nested():
+    """Test pack_seq works correctly for frames with nested columns."""
+    nf = generate_data(10, 3)
+    nf["id"] = [0, 0, 1, 2, 2, 3, 4, 4, 5, 5]
+    desired = NestedFrame.from_flat(nf, base_columns=[], on="id", name="outer")["outer"].reset_index(
+        drop=True,
+    )
+    actual = packer.pack_seq(list(desired))
+    assert_frame_equal(actual.nest.to_flat(), desired.nest.to_flat())
+
+
 def test_view_sorted_df_as_list_arrays():
     """Test view_sorted_df_as_list_arrays()."""
     flat_df = pd.DataFrame(