Merge pull request #272 from lincc-frameworks/to_flatten_inner-repeated_index

hombit · web-flow · commit 9ab6f1d42a53 · 2025-05-27T09:35:45.000-04:00
Fix to_flatten_inner for repeated index
diff --git a/src/nested_pandas/series/accessor.py b/src/nested_pandas/series/accessor.py
@@ -112,7 +112,7 @@ def to_flat(self, fields: list[str] | None = None) -> pd.DataFrame:
         if len(fields) == 0:
             raise ValueError("Cannot flatten a struct with no fields")
 
-        index = pd.Series(self.get_flat_index(), name=self._series.index.name)
+        index = self.get_flat_index()
 
         flat_chunks: dict[str, list[pa.Array]] = {field: [] for field in fields}
         for chunk in self._series.array.struct_array.iterchunks():
@@ -557,6 +557,10 @@ def to_flatten_inner(self, field: str) -> pd.Series:
         2. All items of other fields are repeated as many times as that frame
           length.
 
+        It has the same effect as doing
+        `nested_df.drop(field, axis=1).join(nested_df[field].nest.to_flat())`
+        for each nested element of the Series.
+
         Parameters
         ----------
         field : str
@@ -565,14 +569,15 @@ def to_flatten_inner(self, field: str) -> pd.Series:
         Returns
         -------
         pd.Series
-            This series object, but with inner field exploded.
+            This series object, but with the inner field exploded.
 
         Examples
         --------
         >>> from nested_pandas import NestedFrame
         >>> from nested_pandas.datasets import generate_data
         >>> nf = generate_data(5, 2, seed=1).rename(columns={"nested": "inner"})
-        >>> # Assign a repeated ID to double-nest on
+
+        Assign a repeated ID to double-nest on
         >>> nf["id"] = [0, 0, 0, 1, 1]
         >>> nf
                   a         b                                              inner  id
@@ -593,29 +598,31 @@ def to_flatten_inner(self, field: str) -> pd.Series:
         3   2.807739  16.983042    r
         4   0.547752  87.638915    g
         4    3.96203   87.81425    r
-        >>> # Create a dataframe with double-nested column "outer"
+
+        Create a dataframe with double-nested column "outer"
         >>> dnf = NestedFrame.from_flat(nf, base_columns=[], on="id", name="outer")
-        >>> # Flat "inner" nested column.
-        >>> # This is like "concatenation" of the initial nf frame on duplicated `id` rows
+
+        Flat "inner" nested column.
+        This is like "concatenation" of the initial nf frame on duplicated `id` rows
         >>> concated_nf_series = dnf["outer"].nest.to_flatten_inner("inner")
         >>> concated_nf_series
         id
-        0    [{t: 8.38389, flux: 80.074457, band: 'r', a: 0...
-        1    [{t: 17.562349, flux: 69.232262, band: 'r', a:...
-        Name: inner, dtype: nested<t: [double], flux: [double], band: [string], a: [double], b: [double]>
+        0    [{a: 0.417022, b: 0.184677, t: 8.38389, flux: ...
+        1    [{a: 0.302333, b: 0.793535, t: 17.562349, flux...
+        Name: outer, dtype: nested<a: [double], b: [double], t: [double], flux: [double], band: [string]>
         >>> concated_nf_series.nest.to_flat()  # doctest: +NORMALIZE_WHITESPACE
-                    t       flux band         a         b
+                   a         b          t       flux band
         id
-        0     8.38389  80.074457    r  0.417022  0.184677
-        0    13.40935  89.460666    g  0.417022  0.184677
-        0    13.70439  96.826158    g  0.720324   0.37252
-        0    8.346096   8.504421    g  0.720324   0.37252
-        0    4.089045  31.342418    g  0.000114  0.691121
-        0   11.173797   3.905478    g  0.000114  0.691121
-        1   17.562349  69.232262    r  0.302333  0.793535
-        1    2.807739  16.983042    r  0.302333  0.793535
-        1    0.547752  87.638915    g  0.146756  1.077633
-        1     3.96203   87.81425    r  0.146756  1.077633
+        0   0.417022  0.184677    8.38389  80.074457    r
+        0   0.417022  0.184677   13.40935  89.460666    g
+        0   0.720324   0.37252   13.70439  96.826158    g
+        0   0.720324   0.37252   8.346096   8.504421    g
+        0   0.000114  0.691121   4.089045  31.342418    g
+        0   0.000114  0.691121  11.173797   3.905478    g
+        1   0.302333  0.793535  17.562349  69.232262    r
+        1   0.302333  0.793535   2.807739  16.983042    r
+        1   0.146756  1.077633   0.547752  87.638915    g
+        1   0.146756  1.077633    3.96203   87.81425    r
         """
         if not isinstance(self._series.dtype.field_dtype(field), NestedDtype):
             raise ValueError(
@@ -624,15 +631,34 @@ def to_flatten_inner(self, field: str) -> pd.Series:
 
         # Copy series and make an "ordinal" index
         series = self._series.reset_index(drop=True)
-        # Get a flat representation of the field
-        inner = self[field]
-        # Embed all other fields into the nested inner field, so the only field is left
-        for other_field in self.fields:
-            if other_field == field:
-                continue
-            inner = inner.nest.with_filled_field(other_field, series.nest[other_field])
-        # Repack flat inner back to nested series
-        result = pack_flat(inner.nest.to_flat(), name=field)
-        # Restore index
+
+        # Flat the array and set a multiindex.
+        # "outer" is the ordinal index over the original "top"-level series.
+        # "inner" is the ordinal index over the flatten series, e.g., over the first-level nested rows.
+        # "inner" has more unique values than "outer".
+        # The total number of double-nested rows is larger than "inner".
+        series_flatten = series.nest.to_flat()
+        series_flatten = series_flatten.set_index(
+            [
+                pd.Index(series_flatten.index, name="outer"),
+                pd.RangeIndex(len(series_flatten), name="inner"),
+            ]
+        )
+
+        # Use "inner" ordinal index for the join and drop it
+        field_flatten = series_flatten[field].nest.to_flat().reset_index("outer", drop=True)
+        inner_flatten = series_flatten.drop(field, axis=1).join(field_flatten, on="inner")
+        inner_flatten = inner_flatten.reset_index("inner", drop=True)
+
+        # Assign back the "outer" ordinal index and pack on it
+        result = pack_flat(inner_flatten, name=self._series.name)
+
+        # Some indexes may be missed if the original series had some NULLs
+        if len(result) < len(series):
+            nulls = pd.Series(None, index=series.index, dtype=result.dtype)
+            nulls[result.index] = result
+            result = nulls
+
+        # And put back the original index
         result.index = self._series.index
         return result
diff --git a/src/nested_pandas/series/ext_array.py b/src/nested_pandas/series/ext_array.py
@@ -607,10 +607,22 @@ def __arrow_array__(self, type=None):
             return self.struct_array.cast(type)
         return self.list_array.cast(type)
 
-    def __array__(self, dtype=None):
-        """Convert the extension array to a numpy array."""
+    def __array__(self, dtype=None, copy=True):
+        """Convert the extension array to a numpy array.
+
+        Parameters
+        ----------
+        dtype : numpy.dtype, optional
+            The dtype of the resulting array
+        copy : bool, default True
+            Whether to return a copy of the data
 
-        array = self.to_numpy(dtype=dtype, copy=False)
+        Returns
+        -------
+        numpy.ndarray
+            The numpy array representation of the extension array
+        """
+        array = self.to_numpy(dtype=dtype, copy=copy)
 
         # Check if called inside _ExtensionArrayFormatter._format_strings
         # If yes, repack nested data-frames into a wrapper object, so
diff --git a/tests/nested_pandas/series/test_accessor.py b/tests/nested_pandas/series/test_accessor.py
@@ -3,7 +3,7 @@
 import pandas as pd
 import pyarrow as pa
 import pytest
-from nested_pandas import NestedDtype, NestedFrame
+from nested_pandas import NestedDtype, NestedFrame, read_parquet
 from nested_pandas.datasets import generate_data
 from nested_pandas.series.ext_array import NestedExtensionArray
 from nested_pandas.series.packer import pack_flat, pack_seq
@@ -1078,22 +1078,46 @@ def test_get_list_index():
 def test_to_flatten_inner():
     """Test .nest.to_flatten_inner()"""
     nf = generate_data(10, 2)
+    # Assign repeated index to make it harder
     nf["a"] = nf["a"].astype(pd.ArrowDtype(pa.float64()))
     nf["b"] = nf["b"].astype(pd.ArrowDtype(pa.float64()))
     nf = nf.assign(id=np.repeat(np.r_[0:5], 2))
     nf = nf.rename(columns={"nested": "inner"})
     nnf = NestedFrame.from_flat(nf, base_columns=[], on="id", name="outer")
+    nnf.index = ["a", "a", "b", "b", "c"]
 
-    outer_flatten = nnf["outer"].nest.to_flatten_inner("inner")
+    actual = nnf["outer"].nest.to_flatten_inner("inner")
 
-    assert_frame_equal(
-        nf.drop("inner", axis=1).join(nf["inner"].nest.to_flat()).set_index("id"),
-        outer_flatten.nest.to_flat(),
-        check_like=True,
-    )
+    desired_dfs = []
+    for nested_df in nnf["outer"]:
+        nested_df = NestedFrame(nested_df)
+        desired_df = nested_df.drop("inner", axis=1)
+        desired_df = desired_df.join(nested_df["inner"].nest.to_flat())
+        desired_dfs.append(desired_df)
+    desired = pack_seq(desired_dfs, index=["a", "a", "b", "b", "c"])
+
+    assert actual.shape == desired.shape
+    assert_frame_equal(actual.nest.to_flat(), desired.nest.to_flat(), check_like=True)
+
+
+def test_to_flatten_inner_empty_inner():
+    """Test .nest.to_flatten_inner for the case when inner frames are empty"""
+    nf = generate_data(10, 2)
+    nf["nested"][2:4] = [pd.DataFrame({"t": [], "flux": [], "band": []})] * 2
+    nf = nf.assign(id=np.repeat(np.r_[0:5], 2))
+    nf = nf.rename(columns={"nested": "inner"})
+    nnf = NestedFrame.from_flat(nf, base_columns=[], on="id", name="outer")
+
+    _actual = nnf["outer"].nest.to_flatten_inner("inner")
+
+
+def test_to_flatten_inner_none_nested():
+    """Test .nest.to_flatten_inner with vsx-x-ztfdr22_lc-m31.parquet file"""
+    nnf = read_parquet("tests/test_data/vsx-x-ztfdr22_lc-m31.parquet")
+    _actual = nnf["ztf"].nest.to_flatten_inner("lc")
 
 
-def test_to_flatten_outer_wrong_field():
+def test_to_flatten_inner_wrong_field():
     """Test an exception is raised when .nest.to_flatten_inner() called for a wrong field."""
     nf = generate_data(10, 2)
     with pytest.raises(ValueError):
@@ -1116,5 +1140,5 @@ def test_issue266():
     empty_outer_flatten = empty_nnf["outer"].nest.to_flatten_inner("inner")
 
     assert empty_outer_flatten.dtype == NestedDtype.from_fields(
-        {"t": pa.float64(), "flux": pa.float64(), "band": pa.string(), "a": pa.float64(), "b": pa.float64()}
+        {"a": pa.float64(), "b": pa.float64(), "t": pa.float64(), "flux": pa.float64(), "band": pa.string()}
     )
diff --git a/tests/test_data/vsx-x-ztfdr22_lc-m31.parquet b/tests/test_data/vsx-x-ztfdr22_lc-m31.parquet