Skip to content

Commit c136206

Browse files
authored
Merge pull request #267 from lincc-frameworks/empty-data-storages
Fix .nest.to_flatten_inner for empty inputs
2 parents bdfbce4 + 5703c39 commit c136206

File tree

5 files changed

+40
-7
lines changed

5 files changed

+40
-7
lines changed

src/nested_pandas/series/_storage/table_storage.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,11 @@
44

55
import pyarrow as pa
66

7-
from nested_pandas.series.utils import table_to_struct_array, validate_struct_list_array_for_equal_lengths
7+
from nested_pandas.series.utils import (
8+
table_from_struct_array,
9+
table_to_struct_array,
10+
validate_struct_list_array_for_equal_lengths,
11+
)
812

913
if TYPE_CHECKING:
1014
from nested_pandas.series._storage.list_struct_storage import ListStructStorage
@@ -58,5 +62,5 @@ def from_struct_list_storage(cls, struct_list_storage: StructListStorage) -> Sel
5862
struct_list_storage : StructListStorage
5963
StructListStorage object.
6064
"""
61-
table = pa.Table.from_struct_array(struct_list_storage.data)
65+
table = table_from_struct_array(struct_list_storage.data)
6266
return cls(table, validate=False)

src/nested_pandas/series/accessor.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# Python 3.9 doesn't support "|" for types
22
from __future__ import annotations
33

4-
from collections import defaultdict
54
from collections.abc import Generator, Mapping
65
from typing import cast
76

@@ -115,7 +114,7 @@ def to_flat(self, fields: list[str] | None = None) -> pd.DataFrame:
115114

116115
index = pd.Series(self.get_flat_index(), name=self._series.index.name)
117116

118-
flat_chunks = defaultdict(list)
117+
flat_chunks: dict[str, list[pa.Array]] = {field: [] for field in fields}
119118
for chunk in self._series.array.struct_array.iterchunks():
120119
struct_array = cast(pa.StructArray, chunk)
121120
for field in fields:
@@ -125,13 +124,16 @@ def to_flat(self, fields: list[str] | None = None) -> pd.DataFrame:
125124

126125
flat_series = {}
127126
for field, chunks in flat_chunks.items():
127+
dtype = self._series.dtype.field_dtype(field)
128+
if len(chunks) == 0:
129+
chunks = [pa.array([])]
128130
chunked_array = pa.chunked_array(chunks)
129131
flat_series[field] = pd.Series(
130132
chunked_array,
131133
index=index,
132134
name=field,
133135
copy=False,
134-
dtype=self._series.dtype.field_dtype(field),
136+
dtype=dtype,
135137
)
136138

137139
return pd.DataFrame(flat_series)

src/nested_pandas/series/packer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -209,8 +209,8 @@ def pack_lists(df: pd.DataFrame, name: str | None = None, *, validate: bool = Tr
209209
chunk_lengths = pa.array([[len(chunk) for chunk in arr.chunks] for arr in pa_chunked_arrays.values()])
210210
if all(chunk_length == chunk_lengths[0] for chunk_length in chunk_lengths):
211211
chunks = []
212-
numpy_chunks = next(iter(pa_chunked_arrays.values())).num_chunks
213-
for i in range(numpy_chunks):
212+
num_chunks = next(iter(pa_chunked_arrays.values())).num_chunks
213+
for i in range(num_chunks):
214214
chunks.append(
215215
pa.StructArray.from_arrays(
216216
[arr.chunk(i) for arr in pa_chunked_arrays.values()],

src/nested_pandas/series/utils.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,3 +307,10 @@ def table_to_struct_array(table: pa.Table) -> pa.ChunkedArray:
307307
if len(table) == 0:
308308
return pa.chunked_array([], type=pa.struct(table.schema))
309309
return table.to_struct_array()
310+
311+
312+
def table_from_struct_array(array: pa.ChunkedArray | pa.Array) -> pa.Table:
    """pa.Table.from_struct_array, but working with chunkless input

    Parameters
    ----------
    array : pa.ChunkedArray or pa.Array
        Struct array to convert. A ``pa.ChunkedArray`` with zero chunks
        is accepted as well.

    Returns
    -------
    pa.Table
        The result of ``pa.Table.from_struct_array`` for the input, or for
        an empty array of the same type when the input has no chunks.
    """
    if isinstance(array, pa.ChunkedArray) and array.num_chunks == 0:
        # Swap the chunkless input for an empty, plain array of the same
        # type before delegating to pyarrow.
        array = pa.array([], type=array.type)
    return pa.Table.from_struct_array(array)

tests/nested_pandas/series/test_accessor.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1098,3 +1098,23 @@ def test_to_flatten_outer_wrong_field():
10981098
nf = generate_data(10, 2)
10991099
with pytest.raises(ValueError):
11001100
nf.nested.nest.to_flatten_inner("t")
1101+
1102+
1103+
def test_issue266():
    """Regression test: .nest.to_flatten_inner() must work on an empty series.

    https://github.com/lincc-frameworks/nested-pandas/issues/266
    """
    # Generated frame with an extra "id" column (0..4, each repeated twice)
    # and its "nested" column renamed to "inner".
    frame = (
        generate_data(10, 2)
        .assign(id=np.repeat(np.r_[0:5], 2))
        .rename(columns={"nested": "inner"})
    )
    # Nest once more on "id", so "inner" ends up inside the "outer" column.
    doubly_nested = NestedFrame.from_flat(frame, base_columns=[], on="id", name="outer")

    # Zero-row slice: this is the input the issue is about.
    no_rows = doubly_nested.iloc[0:0]
    flattened = no_rows["outer"].nest.to_flatten_inner("inner")

    expected_dtype = NestedDtype.from_fields(
        {"t": pa.float64(), "flux": pa.float64(), "band": pa.string(), "a": pa.float64(), "b": pa.float64()}
    )
    assert flattened.dtype == expected_dtype

0 commit comments

Comments
 (0)