@@ -112,7 +112,7 @@ def to_flat(self, fields: list[str] | None = None) -> pd.DataFrame:
         if len(fields) == 0:
             raise ValueError("Cannot flatten a struct with no fields")
 
-        index = pd.Series(self.get_flat_index(), name=self._series.index.name)
+        index = self.get_flat_index()
 
         flat_chunks: dict[str, list[pa.Array]] = {field: [] for field in fields}
         for chunk in self._series.array.struct_array.iterchunks():
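
The simplified assignment above suggests that `get_flat_index()` already returns a ready-to-use index, so the extra `pd.Series(...)` wrapper (and the explicit name) was redundant. As a rough illustration only, under the assumption that a "flat" index is the original index repeated once per element of each nested list (this is not the library's actual implementation):

import pandas as pd

# Hypothetical illustration: two nested rows whose lists hold 2 and 3
# elements expand into a flat index with 5 entries, keeping the index name.
index = pd.Index([10, 20], name="id")
list_lengths = [2, 3]
flat_index = index.repeat(list_lengths)
print(flat_index)  # Index([10, 10, 20, 20, 20], dtype='int64', name='id')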
@@ -557,6 +557,10 @@ def to_flatten_inner(self, field: str) -> pd.Series:
         2. All items of other fields are repeated as many times as that frame
            length.
 
+        This has the same effect as applying
+        `nested_df.drop(field, axis=1).join(nested_df[field].nest.to_flat())`
+        to each nested element of the Series.
+
         Parameters
         ----------
         field : str
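
To make the equivalence stated in the added docstring lines concrete, here is a minimal sketch reusing the doctest's `generate_data` setup (shown in the following hunks; column names a, b, t, flux, band come from that example). It applies the drop-and-join construction to a singly-nested frame:

from nested_pandas.datasets import generate_data

# Build the same singly-nested frame the doctest uses: base columns "a", "b"
# and a nested column "inner" with fields "t", "flux", "band".
nf = generate_data(5, 2, seed=1).rename(columns={"nested": "inner"})

# Drop the nested column and join back its flat representation: every
# (a, b) pair is repeated once per row of the corresponding "inner" frame.
flat = nf.drop("inner", axis=1).join(nf["inner"].nest.to_flat())
print(flat.columns.tolist())  # ['a', 'b', 't', 'flux', 'band']

`to_flatten_inner` performs this same operation one nesting level down, for each element of a doubly-nested series.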
@@ -565,14 +569,15 @@ def to_flatten_inner(self, field: str) -> pd.Series:
         Returns
         -------
         pd.Series
-            This series object, but with inner field exploded.
+            This series object, but with the inner field exploded.
 
         Examples
         --------
         >>> from nested_pandas import NestedFrame
         >>> from nested_pandas.datasets import generate_data
         >>> nf = generate_data(5, 2, seed=1).rename(columns={"nested": "inner"})
-        >>> # Assign a repeated ID to double-nest on
+
+        Assign a repeated ID to double-nest on
         >>> nf["id"] = [0, 0, 0, 1, 1]
         >>> nf
                   a          b                                        inner  id
@@ -593,29 +598,31 @@ def to_flatten_inner(self, field: str) -> pd.Series:
         3   2.807739  16.983042    r
         4   0.547752  87.638915    g
         4    3.96203   87.81425    r
-        >>> # Create a dataframe with double-nested column "outer"
+
+        Create a dataframe with a double-nested column "outer"
         >>> dnf = NestedFrame.from_flat(nf, base_columns=[], on="id", name="outer")
-        >>> # Flat "inner" nested column.
-        >>> # This is like "concatenation" of the initial nf frame on duplicated `id` rows
+
+        Flatten the "inner" nested column.
+        This is like a "concatenation" of the initial `nf` frame on the duplicated `id` rows
         >>> concated_nf_series = dnf["outer"].nest.to_flatten_inner("inner")
         >>> concated_nf_series
         id
-        0    [{t: 8.38389, flux: 80.074457, band: 'r', a: 0...
-        1    [{t: 17.562349, flux: 69.232262, band: 'r', a: ...
-        Name: inner, dtype: nested<t: [double], flux: [double], band: [string], a: [double], b: [double]>
+        0    [{a: 0.417022, b: 0.184677, t: 8.38389, flux: ...
+        1    [{a: 0.302333, b: 0.793535, t: 17.562349, flux ...
+        Name: outer, dtype: nested<a: [double], b: [double], t: [double], flux: [double], band: [string]>
         >>> concated_nf_series.nest.to_flat()  # doctest: +NORMALIZE_WHITESPACE
-                    t       flux band         a         b
+                   a         b          t       flux band
         id
-        0     8.38389  80.074457    r  0.417022  0.184677
-        0    13.40935  89.460666    g  0.417022  0.184677
-        0    13.70439  96.826158    g  0.720324   0.37252
-        0    8.346096   8.504421    g  0.720324   0.37252
-        0    4.089045  31.342418    g  0.000114  0.691121
-        0   11.173797   3.905478    g  0.000114  0.691121
-        1   17.562349  69.232262    r  0.302333  0.793535
-        1    2.807739  16.983042    r  0.302333  0.793535
-        1    0.547752  87.638915    g  0.146756  1.077633
-        1     3.96203   87.81425    r  0.146756  1.077633
+        0   0.417022  0.184677    8.38389  80.074457    r
+        0   0.417022  0.184677   13.40935  89.460666    g
+        0   0.720324   0.37252   13.70439  96.826158    g
+        0   0.720324   0.37252   8.346096   8.504421    g
+        0   0.000114  0.691121   4.089045  31.342418    g
+        0   0.000114  0.691121  11.173797   3.905478    g
+        1   0.302333  0.793535  17.562349  69.232262    r
+        1   0.302333  0.793535   2.807739  16.983042    r
+        1   0.146756  1.077633   0.547752  87.638915    g
+        1   0.146756  1.077633    3.96203   87.81425    r
         """
         if not isinstance(self._series.dtype.field_dtype(field), NestedDtype):
             raise ValueError(
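
The reordered columns in the rewritten doctest output follow directly from the drop-and-join construction used by the new implementation (next hunk): pandas `DataFrame.join` appends the other frame's columns after the caller's, so the untouched sibling fields "a" and "b" now precede the exploded "t", "flux", "band". A plain-pandas sketch of that ordering behaviour (frames and names here are illustrative only):

import pandas as pd

# Dropping one column and joining another frame keeps the remaining columns
# first and appends the joined frame's columns afterwards.
left = pd.DataFrame({"a": [1, 2], "b": [3, 4], "field": [0, 0]})
right = pd.DataFrame({"t": [5.0, 6.0], "flux": [7.0, 8.0]})
joined = left.drop("field", axis=1).join(right)
print(joined.columns.tolist())  # ['a', 'b', 't', 'flux']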
@@ -624,15 +631,34 @@ def to_flatten_inner(self, field: str) -> pd.Series:
 
         # Copy series and make an "ordinal" index
         series = self._series.reset_index(drop=True)
-        # Get a flat representation of the field
-        inner = self[field]
-        # Embed all other fields into the nested inner field, so the only field is left
-        for other_field in self.fields:
-            if other_field == field:
-                continue
-            inner = inner.nest.with_filled_field(other_field, series.nest[other_field])
-        # Repack flat inner back to nested series
-        result = pack_flat(inner.nest.to_flat(), name=field)
-        # Restore index
+
+        # Flatten the array and set a multi-index.
+        # "outer" is the ordinal index over the original "top"-level series.
+        # "inner" is the ordinal index over the flattened series, i.e., over the first-level nested rows.
+        # "inner" has more unique values than "outer".
+        # The total number of double-nested rows is larger than the number of "inner" values.
+        series_flatten = series.nest.to_flat()
+        series_flatten = series_flatten.set_index(
+            [
+                pd.Index(series_flatten.index, name="outer"),
+                pd.RangeIndex(len(series_flatten), name="inner"),
+            ]
+        )
+
+        # Use the "inner" ordinal index for the join, then drop it
+        field_flatten = series_flatten[field].nest.to_flat().reset_index("outer", drop=True)
+        inner_flatten = series_flatten.drop(field, axis=1).join(field_flatten, on="inner")
+        inner_flatten = inner_flatten.reset_index("inner", drop=True)
+
+        # Assign back the "outer" ordinal index and pack on it
+        result = pack_flat(inner_flatten, name=self._series.name)
+
+        # Some index values may be missing if the original series had NULLs
+        if len(result) < len(series):
+            nulls = pd.Series(None, index=series.index, dtype=result.dtype)
+            nulls[result.index] = result
+            result = nulls
+
+        # And put back the original index
         result.index = self._series.index
         return result
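
For readers unfamiliar with the join-on-an-ordinal-level trick above, the sketch below reproduces it on plain pandas DataFrames (the nested_pandas accessor calls are replaced by ordinary frames; all names and values are illustrative): each first-level row gets an ordinal "inner" label, the doubly-nested field is flattened against that same label, and the join repeats the sibling fields once per innermost row.

import pandas as pd

# Stand-in for series.nest.to_flat(): one row per first-level nested element,
# indexed by the ordinal position of the original ("outer") series row.
first_level = pd.DataFrame(
    {"a": [0.1, 0.2, 0.3]},
    index=pd.Index([0, 0, 1], name="outer"),
)
# Append an ordinal "inner" label so each first-level row is uniquely keyed.
first_level = first_level.set_index(
    pd.RangeIndex(len(first_level), name="inner"), append=True
)

# Stand-in for the doubly-nested field flattened one more level:
# several rows per "inner" label.
field_flat = pd.DataFrame(
    {"t": [1.0, 2.0, 3.0, 4.0]},
    index=pd.Index([0, 0, 1, 2], name="inner"),
)

# Join on the "inner" level: the sibling column "a" is repeated for every
# innermost row, then the helper level is dropped again.
joined = first_level.join(field_flat, on="inner").reset_index("inner", drop=True)
print(joined["a"].tolist())    # [0.1, 0.1, 0.2, 0.3]
print(joined["t"].tolist())    # [1.0, 2.0, 3.0, 4.0]
print(joined.index.tolist())   # [0, 0, 0, 1]

Packing `joined` back on the remaining "outer" index (what `pack_flat` does in the real code) then yields one nested frame per original series element.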