Skip to content

Commit 9ab6f1d

Browse files
authored
Merge pull request #272 from lincc-frameworks/to_flatten_inner-repeated_index
Fix to_flatten_inner for repeated index
2 parents 0980e1c + 79e0ed5 commit 9ab6f1d

File tree

4 files changed

+104
-42
lines changed

4 files changed

+104
-42
lines changed

src/nested_pandas/series/accessor.py

Lines changed: 56 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ def to_flat(self, fields: list[str] | None = None) -> pd.DataFrame:
112112
if len(fields) == 0:
113113
raise ValueError("Cannot flatten a struct with no fields")
114114

115-
index = pd.Series(self.get_flat_index(), name=self._series.index.name)
115+
index = self.get_flat_index()
116116

117117
flat_chunks: dict[str, list[pa.Array]] = {field: [] for field in fields}
118118
for chunk in self._series.array.struct_array.iterchunks():
@@ -557,6 +557,10 @@ def to_flatten_inner(self, field: str) -> pd.Series:
557557
2. All items of other fields are repeated as many times as that frame
558558
length.
559559
560+
It has the same effect as doing
561+
`nested_df.drop(field, axis=1).join(nested_df[field].nest.to_flat())`
562+
for each nested element of the Series.
563+
560564
Parameters
561565
----------
562566
field : str
@@ -565,14 +569,15 @@ def to_flatten_inner(self, field: str) -> pd.Series:
565569
Returns
566570
-------
567571
pd.Series
568-
This series object, but with inner field exploded.
572+
This series object, but with the inner field exploded.
569573
570574
Examples
571575
--------
572576
>>> from nested_pandas import NestedFrame
573577
>>> from nested_pandas.datasets import generate_data
574578
>>> nf = generate_data(5, 2, seed=1).rename(columns={"nested": "inner"})
575-
>>> # Assign a repeated ID to double-nest on
579+
580+
Assign a repeated ID to double-nest on
576581
>>> nf["id"] = [0, 0, 0, 1, 1]
577582
>>> nf
578583
a b inner id
@@ -593,29 +598,31 @@ def to_flatten_inner(self, field: str) -> pd.Series:
593598
3 2.807739 16.983042 r
594599
4 0.547752 87.638915 g
595600
4 3.96203 87.81425 r
596-
>>> # Create a dataframe with double-nested column "outer"
601+
602+
Create a dataframe with double-nested column "outer"
597603
>>> dnf = NestedFrame.from_flat(nf, base_columns=[], on="id", name="outer")
598-
>>> # Flat "inner" nested column.
599-
>>> # This is like "concatenation" of the initial nf frame on duplicated `id` rows
604+
605+
Flatten the "inner" nested column.
606+
This is like "concatenation" of the initial nf frame on duplicated `id` rows
600607
>>> concated_nf_series = dnf["outer"].nest.to_flatten_inner("inner")
601608
>>> concated_nf_series
602609
id
603-
0 [{t: 8.38389, flux: 80.074457, band: 'r', a: 0...
604-
1 [{t: 17.562349, flux: 69.232262, band: 'r', a:...
605-
Name: inner, dtype: nested<t: [double], flux: [double], band: [string], a: [double], b: [double]>
610+
0 [{a: 0.417022, b: 0.184677, t: 8.38389, flux: ...
611+
1 [{a: 0.302333, b: 0.793535, t: 17.562349, flux...
612+
Name: outer, dtype: nested<a: [double], b: [double], t: [double], flux: [double], band: [string]>
606613
>>> concated_nf_series.nest.to_flat() # doctest: +NORMALIZE_WHITESPACE
607-
t flux band a b
614+
a b t flux band
608615
id
609-
0 8.38389 80.074457 r 0.417022 0.184677
610-
0 13.40935 89.460666 g 0.417022 0.184677
611-
0 13.70439 96.826158 g 0.720324 0.37252
612-
0 8.346096 8.504421 g 0.720324 0.37252
613-
0 4.089045 31.342418 g 0.000114 0.691121
614-
0 11.173797 3.905478 g 0.000114 0.691121
615-
1 17.562349 69.232262 r 0.302333 0.793535
616-
1 2.807739 16.983042 r 0.302333 0.793535
617-
1 0.547752 87.638915 g 0.146756 1.077633
618-
1 3.96203 87.81425 r 0.146756 1.077633
616+
0 0.417022 0.184677 8.38389 80.074457 r
617+
0 0.417022 0.184677 13.40935 89.460666 g
618+
0 0.720324 0.37252 13.70439 96.826158 g
619+
0 0.720324 0.37252 8.346096 8.504421 g
620+
0 0.000114 0.691121 4.089045 31.342418 g
621+
0 0.000114 0.691121 11.173797 3.905478 g
622+
1 0.302333 0.793535 17.562349 69.232262 r
623+
1 0.302333 0.793535 2.807739 16.983042 r
624+
1 0.146756 1.077633 0.547752 87.638915 g
625+
1 0.146756 1.077633 3.96203 87.81425 r
619626
"""
620627
if not isinstance(self._series.dtype.field_dtype(field), NestedDtype):
621628
raise ValueError(
@@ -624,15 +631,34 @@ def to_flatten_inner(self, field: str) -> pd.Series:
624631

625632
# Copy series and make an "ordinal" index
626633
series = self._series.reset_index(drop=True)
627-
# Get a flat representation of the field
628-
inner = self[field]
629-
# Embed all other fields into the nested inner field, so the only field is left
630-
for other_field in self.fields:
631-
if other_field == field:
632-
continue
633-
inner = inner.nest.with_filled_field(other_field, series.nest[other_field])
634-
# Repack flat inner back to nested series
635-
result = pack_flat(inner.nest.to_flat(), name=field)
636-
# Restore index
634+
635+
# Flatten the array and set a multi-index.
636+
# "outer" is the ordinal index over the original "top"-level series.
637+
# "inner" is the ordinal index over the flattened series, i.e., over the first-level nested rows.
638+
# "inner" has more unique values than "outer".
639+
# The total number of double-nested rows is larger than "inner".
640+
series_flatten = series.nest.to_flat()
641+
series_flatten = series_flatten.set_index(
642+
[
643+
pd.Index(series_flatten.index, name="outer"),
644+
pd.RangeIndex(len(series_flatten), name="inner"),
645+
]
646+
)
647+
648+
# Use "inner" ordinal index for the join and drop it
649+
field_flatten = series_flatten[field].nest.to_flat().reset_index("outer", drop=True)
650+
inner_flatten = series_flatten.drop(field, axis=1).join(field_flatten, on="inner")
651+
inner_flatten = inner_flatten.reset_index("inner", drop=True)
652+
653+
# Assign back the "outer" ordinal index and pack on it
654+
result = pack_flat(inner_flatten, name=self._series.name)
655+
656+
# Some indexes may be missing if the original series had some NULLs
657+
if len(result) < len(series):
658+
nulls = pd.Series(None, index=series.index, dtype=result.dtype)
659+
nulls[result.index] = result
660+
result = nulls
661+
662+
# And put back the original index
637663
result.index = self._series.index
638664
return result

src/nested_pandas/series/ext_array.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -607,10 +607,22 @@ def __arrow_array__(self, type=None):
607607
return self.struct_array.cast(type)
608608
return self.list_array.cast(type)
609609

610-
def __array__(self, dtype=None):
611-
"""Convert the extension array to a numpy array."""
610+
def __array__(self, dtype=None, copy=True):
611+
"""Convert the extension array to a numpy array.
612+
613+
Parameters
614+
----------
615+
dtype : numpy.dtype, optional
616+
The dtype of the resulting array
617+
copy : bool, default True
618+
Whether to return a copy of the data
612619
613-
array = self.to_numpy(dtype=dtype, copy=False)
620+
Returns
621+
-------
622+
numpy.ndarray
623+
The numpy array representation of the extension array
624+
"""
625+
array = self.to_numpy(dtype=dtype, copy=copy)
614626

615627
# Check if called inside _ExtensionArrayFormatter._format_strings
616628
# If yes, repack nested data-frames into a wrapper object, so

tests/nested_pandas/series/test_accessor.py

Lines changed: 33 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import pandas as pd
44
import pyarrow as pa
55
import pytest
6-
from nested_pandas import NestedDtype, NestedFrame
6+
from nested_pandas import NestedDtype, NestedFrame, read_parquet
77
from nested_pandas.datasets import generate_data
88
from nested_pandas.series.ext_array import NestedExtensionArray
99
from nested_pandas.series.packer import pack_flat, pack_seq
@@ -1078,22 +1078,46 @@ def test_get_list_index():
10781078
def test_to_flatten_inner():
10791079
"""Test .nest.to_flatten_inner()"""
10801080
nf = generate_data(10, 2)
1081+
# Assign repeated index to make it harder
10811082
nf["a"] = nf["a"].astype(pd.ArrowDtype(pa.float64()))
10821083
nf["b"] = nf["b"].astype(pd.ArrowDtype(pa.float64()))
10831084
nf = nf.assign(id=np.repeat(np.r_[0:5], 2))
10841085
nf = nf.rename(columns={"nested": "inner"})
10851086
nnf = NestedFrame.from_flat(nf, base_columns=[], on="id", name="outer")
1087+
nnf.index = ["a", "a", "b", "b", "c"]
10861088

1087-
outer_flatten = nnf["outer"].nest.to_flatten_inner("inner")
1089+
actual = nnf["outer"].nest.to_flatten_inner("inner")
10881090

1089-
assert_frame_equal(
1090-
nf.drop("inner", axis=1).join(nf["inner"].nest.to_flat()).set_index("id"),
1091-
outer_flatten.nest.to_flat(),
1092-
check_like=True,
1093-
)
1091+
desired_dfs = []
1092+
for nested_df in nnf["outer"]:
1093+
nested_df = NestedFrame(nested_df)
1094+
desired_df = nested_df.drop("inner", axis=1)
1095+
desired_df = desired_df.join(nested_df["inner"].nest.to_flat())
1096+
desired_dfs.append(desired_df)
1097+
desired = pack_seq(desired_dfs, index=["a", "a", "b", "b", "c"])
1098+
1099+
assert actual.shape == desired.shape
1100+
assert_frame_equal(actual.nest.to_flat(), desired.nest.to_flat(), check_like=True)
1101+
1102+
1103+
def test_to_flatten_inner_empty_inner():
1104+
"""Test .nest.to_flatten_inner for the case when inner frames are empty"""
1105+
nf = generate_data(10, 2)
1106+
nf["nested"][2:4] = [pd.DataFrame({"t": [], "flux": [], "band": []})] * 2
1107+
nf = nf.assign(id=np.repeat(np.r_[0:5], 2))
1108+
nf = nf.rename(columns={"nested": "inner"})
1109+
nnf = NestedFrame.from_flat(nf, base_columns=[], on="id", name="outer")
1110+
1111+
_actual = nnf["outer"].nest.to_flatten_inner("inner")
1112+
1113+
1114+
def test_to_flatten_inner_none_nested():
1115+
"""Test .nest.to_flatten_inner with vsx-x-ztfdr22_lc-m31.parquet file"""
1116+
nnf = read_parquet("tests/test_data/vsx-x-ztfdr22_lc-m31.parquet")
1117+
_actual = nnf["ztf"].nest.to_flatten_inner("lc")
10941118

10951119

1096-
def test_to_flatten_outer_wrong_field():
1120+
def test_to_flatten_inner_wrong_field():
10971121
"""Test an exception is raised when .nest.to_flatten_inner() called for a wrong field."""
10981122
nf = generate_data(10, 2)
10991123
with pytest.raises(ValueError):
@@ -1116,5 +1140,5 @@ def test_issue266():
11161140
empty_outer_flatten = empty_nnf["outer"].nest.to_flatten_inner("inner")
11171141

11181142
assert empty_outer_flatten.dtype == NestedDtype.from_fields(
1119-
{"t": pa.float64(), "flux": pa.float64(), "band": pa.string(), "a": pa.float64(), "b": pa.float64()}
1143+
{"a": pa.float64(), "b": pa.float64(), "t": pa.float64(), "flux": pa.float64(), "band": pa.string()}
11201144
)
9.1 KB
Binary file not shown.

0 commit comments

Comments
 (0)