Skip to content

Commit cb5b36d

Browse files
authored
Merge pull request #371 from lincc-frameworks/allign-offsets
Align list-array offsets on struct-list validation
2 parents c29f7ee + c11adc6 commit cb5b36d

File tree

5 files changed

+198
-26
lines changed

5 files changed

+198
-26
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ dependencies = [
2121
"numpy>=2",
2222
# We use internal pd._libs.missing and experimental ArrowExtensionArray
2323
"pandas>=2.2.3,<2.4",
24-
"pyarrow>=16", # remove struct_field_names when upgraded to 18+
24+
"pyarrow>=16", # remove struct_field_names() and struct_fields() when upgraded to 18+
2525
"universal_pathlib>=0.2",
2626
]
2727

src/nested_pandas/series/_storage/struct_list_storage.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@
66
import pyarrow as pa
77

88
from nested_pandas.series.utils import (
9+
align_chunked_struct_list_offsets,
910
table_to_struct_array,
1011
transpose_list_struct_chunked,
11-
validate_struct_list_array_for_equal_lengths,
1212
)
1313

1414
if TYPE_CHECKING:
@@ -25,7 +25,9 @@ class StructListStorage:
2525
Pyarrow struct-array with all fields to be list-arrays.
2626
All list-values must be "aligned", i.e., have the same length.
2727
validate : bool (default True)
28-
Check that all the lists have the same lengths for each struct-value.
28+
Check that all the lists have the same lengths for each struct-value,
29+
and that all list offset arrays are the same. Raises ValueError if the
30+
first check fails, and reallocates the data if the second check fails.
2931
"""
3032

3133
_data: pa.ChunkedArray
@@ -37,8 +39,7 @@ def __init__(self, array: pa.StructArray | pa.ChunkedArray, *, validate: bool =
3739
raise ValueError("array must be a StructArray or ChunkedArray")
3840

3941
if validate:
40-
for chunk in array.chunks:
41-
validate_struct_list_array_for_equal_lengths(chunk)
42+
array = align_chunked_struct_list_offsets(array)
4243

4344
self._data = array
4445

src/nested_pandas/series/_storage/table_storage.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@
55
import pyarrow as pa
66

77
from nested_pandas.series.utils import (
8+
align_chunked_struct_list_offsets,
89
table_from_struct_array,
910
table_to_struct_array,
10-
validate_struct_list_array_for_equal_lengths,
1111
)
1212

1313
if TYPE_CHECKING:
@@ -30,8 +30,8 @@ class TableStorage:
3030
def __init__(self, table: pa.Table, validate: bool = True) -> None:
3131
if validate:
3232
struct_array = table_to_struct_array(table)
33-
for chunk in struct_array.iterchunks():
34-
validate_struct_list_array_for_equal_lengths(chunk)
33+
aligned_struct_array = align_chunked_struct_list_offsets(struct_array)
34+
table = table_from_struct_array(aligned_struct_array)
3535

3636
self._data = table
3737

src/nested_pandas/series/utils.py

Lines changed: 90 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,15 @@ def struct_field_names(struct_type: pa.StructType) -> list[str]:
2020
return [f.name for f in struct_type]
2121

2222

23+
def struct_fields(struct_type: pa.StructType) -> list[pa.Field]:
24+
"""Return fields of a pyarrow.StructType in a pyarrow<18-compatible way.
25+
26+
Note: Once we bump our pyarrow requirement to ">=18", this helper can be
27+
replaced with direct usage of ``struct_type.fields`` throughout the codebase.
28+
"""
29+
return [struct_type.field(i) for i in range(struct_type.num_fields)]
30+
31+
2332
def is_pa_type_a_list(pa_type: pa.DataType) -> bool:
2433
"""Check if the given pyarrow type is a list type.
2534
@@ -58,36 +67,97 @@ def is_pa_type_is_list_struct(pa_type: pa.DataType) -> bool:
5867
return is_pa_type_a_list(pa_type) and pa.types.is_struct(pa_type.value_type)
5968

6069

61-
def validate_struct_list_array_for_equal_lengths(array: pa.StructArray) -> None:
62-
"""Check if the given struct array has lists of equal length.
70+
def align_struct_list_offsets(array: pa.StructArray) -> pa.StructArray:
71+
"""Checks if all struct-list offsets are the same, and reallocates if needed
6372
6473
Parameters
6574
----------
6675
array : pa.StructArray
6776
Input struct array.
6877
78+
Returns
79+
-------
80+
pa.StructArray
81+
Array with all struct-list offsets aligned. This is the input array
82+
itself if its offsets were already aligned.
83+
6984
Raises
7085
------
7186
ValueError
72-
If the struct array has lists of unequal length or type of the input
73-
array is not a StructArray or fields are not ListArrays.
87+
If the input is not a valid "nested" StructArray.
7488
"""
7589
if not pa.types.is_struct(array.type):
7690
raise ValueError(f"Expected a StructArray, got {array.type}")
7791

78-
first_list_array: pa.ListArray | None = None
92+
first_offsets: pa.ListArray | None = None
7993
for field in array.type:
8094
inner_array = array.field(field.name)
8195
if not is_pa_type_a_list(inner_array.type):
8296
raise ValueError(f"Expected a ListArray, got {inner_array.type}")
8397
list_array = cast(pa.ListArray, inner_array)
8498

85-
if first_list_array is None:
86-
first_list_array = list_array
99+
if first_offsets is None:
100+
first_offsets = list_array.offsets
87101
continue
88102
# compare offsets from the first list array with the current one
89-
if not first_list_array.offsets.equals(list_array.offsets):
90-
raise ValueError("Offsets of all ListArrays must be the same")
103+
if not first_offsets.equals(list_array.offsets):
104+
break
105+
else:
106+
# Return the original array if all offsets match
107+
return array
108+
109+
new_offsets = pa.compute.subtract(first_offsets, first_offsets[0])
110+
value_lengths = None
111+
list_arrays = []
112+
for field in array.type:
113+
inner_array = array.field(field.name)
114+
list_array = cast(pa.ListArray, inner_array)
115+
116+
if value_lengths is None:
117+
value_lengths = list_array.value_lengths()
118+
elif not value_lengths.equals(list_array.value_lengths()):
119+
raise ValueError(
120+
f"List lengths do not match for struct fields {array.type.field(0).name} and {field.name}",
121+
)
122+
123+
list_arrays.append(
124+
pa.ListArray.from_arrays(
125+
values=list_array.values[list_array.offsets[0].as_py() : list_array.offsets[-1].as_py()],
126+
offsets=new_offsets,
127+
)
128+
)
129+
new_array = pa.StructArray.from_arrays(
130+
arrays=list_arrays,
131+
fields=struct_fields(array.type),
132+
)
133+
return new_array
134+
135+
136+
def align_chunked_struct_list_offsets(array: pa.Array | pa.ChunkedArray) -> pa.ChunkedArray:
137+
"""Checks if all struct-list offsets are the same, and reallocates if needed
138+
139+
Parameters
140+
----------
141+
array : pa.ChunkedArray or pa.Array
142+
Input chunked array; it must be a valid "nested" struct-list array,
143+
e.g. all list lengths must match. Non-chunked arrays are allowed,
144+
but the returned array will always be chunked.
145+
146+
Returns
147+
-------
148+
pa.ChunkedArray
149+
Chunked array with all struct-list offsets aligned.
150+
151+
Raises
152+
------
153+
ValueError
154+
If the input is not a valid "nested" struct-list-array.
155+
"""
156+
if isinstance(array, pa.Array):
157+
array = pa.chunked_array([array])
158+
chunks = [align_struct_list_offsets(chunk) for chunk in array.iterchunks()]
159+
# Pass the type explicitly so that a zero-chunk array can still be built
160+
return pa.chunked_array(chunks, type=array.type)
91161

92162

93163
def transpose_struct_list_type(t: pa.StructType) -> pa.ListType:
@@ -139,7 +209,7 @@ def transpose_struct_list_array(array: pa.StructArray, validate: bool = True) ->
139209
List array of structs.
140210
"""
141211
if validate:
142-
validate_struct_list_array_for_equal_lengths(array)
212+
array = align_struct_list_offsets(array)
143213

144214
mask = array.is_null()
145215
if not pa.compute.any(mask).as_py():
@@ -220,6 +290,16 @@ def validate_list_struct_type(t: pa.ListType) -> None:
220290
raise ValueError(f"Expected a StructType as a list value type, got {t.value_type}")
221291

222292

293+
def validate_struct_list_type(t: pa.StructType) -> None:
294+
"""Raise a ValueError if not a struct-list-type."""
295+
if not pa.types.is_struct(t):
296+
raise ValueError(f"Expected a StructType, got {t}")
297+
298+
for field in struct_fields(t):
299+
if not is_pa_type_a_list(field.type):
300+
raise ValueError(f"Expected a ListType for field {field.name}, got {field.type}")
301+
302+
223303
def transpose_list_struct_type(t: pa.ListType) -> pa.StructType:
224304
"""Converts a type of list-struct array into a type of struct-list array.
225305

tests/nested_pandas/series/test_series_utils.py

Lines changed: 99 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,34 +3,36 @@
33
import pytest
44
from nested_pandas import NestedDtype
55
from nested_pandas.series.utils import (
6+
align_chunked_struct_list_offsets,
7+
align_struct_list_offsets,
68
nested_types_mapper,
79
struct_field_names,
810
transpose_list_struct_array,
911
transpose_list_struct_scalar,
1012
transpose_list_struct_type,
1113
transpose_struct_list_array,
1214
transpose_struct_list_type,
13-
validate_struct_list_array_for_equal_lengths,
15+
validate_struct_list_type,
1416
)
1517

1618

17-
def test_validate_struct_list_array_for_equal_lengths():
18-
"""Test validate_struct_list_array_for_equal_lengths function."""
19+
def test_align_struct_list_offsets():
20+
"""Test align_struct_list_offsets function."""
1921
# Raises for wrong types
2022
with pytest.raises(ValueError):
21-
validate_struct_list_array_for_equal_lengths(pa.array([], type=pa.int64()))
23+
align_struct_list_offsets(pa.array([], type=pa.int64()))
2224
with pytest.raises(ValueError):
23-
validate_struct_list_array_for_equal_lengths(pa.array([], type=pa.list_(pa.int64())))
25+
align_struct_list_offsets(pa.array([], type=pa.list_(pa.int64())))
2426

2527
# Raises if one of the fields is not a ListArray
2628
with pytest.raises(ValueError):
27-
validate_struct_list_array_for_equal_lengths(
29+
align_struct_list_offsets(
2830
pa.StructArray.from_arrays([pa.array([[1, 2], [3, 4, 5]]), pa.array([1, 2])], ["a", "b"])
2931
)
3032

3133
# Raises for mismatched lengths
3234
with pytest.raises(ValueError):
33-
validate_struct_list_array_for_equal_lengths(
35+
align_struct_list_offsets(
3436
pa.StructArray.from_arrays(
3537
[pa.array([[1, 2], [3, 4, 5]]), pa.array([[1, 2, 3], [4, 5]])], ["a", "b"]
3638
)
@@ -43,7 +45,96 @@ def test_validate_struct_list_array_for_equal_lengths():
4345
],
4446
names=["a", "b"],
4547
)
46-
assert validate_struct_list_array_for_equal_lengths(input_array) is None
48+
assert align_struct_list_offsets(input_array) is input_array
49+
50+
a = pa.array([[0, 0, 0], [1, 2], [3, 4], [], [5, 6, 7]])[1:]
51+
assert a.offsets[0].as_py() == 3
52+
b = pa.array([["x", "y"], ["y", "x"], [], ["d", "e", "f"]])
53+
assert b.offsets[0].as_py() == 0
54+
input_array = pa.StructArray.from_arrays(
55+
arrays=[a, b],
56+
names=["a", "b"],
57+
)
58+
aligned_array = align_struct_list_offsets(input_array)
59+
assert aligned_array is not input_array
60+
assert aligned_array.equals(input_array)
61+
62+
63+
def test_align_chunked_struct_list_offsets():
64+
"""Test align_chunked_struct_list_offsets function."""
65+
# Input is an array, output is chunked array
66+
a = pa.array([[1, 2], [3, 4], [], [5, 6, 7]])
67+
b = pa.array([["x", "y"], ["y", "x"], [], ["d", "e", "f"]])
68+
input_array = pa.StructArray.from_arrays(
69+
arrays=[a, b],
70+
names=["a", "b"],
71+
)
72+
output_array = align_chunked_struct_list_offsets(input_array)
73+
assert isinstance(output_array, pa.ChunkedArray)
74+
assert output_array.equals(pa.chunked_array([input_array]))
75+
76+
# Input is an "aligned" chunked array
77+
input_array = pa.chunked_array(
78+
[
79+
pa.StructArray.from_arrays(
80+
arrays=[a, b],
81+
names=["a", "b"],
82+
)
83+
]
84+
* 2
85+
)
86+
output_array = align_chunked_struct_list_offsets(input_array)
87+
assert output_array.equals(input_array)
88+
89+
# Input is an "aligned" chunked array, but offsets do not start with zero
90+
a = pa.array([[0, 0, 0], [1, 2], [3, 4], [], [5, 6, 7]])[1:]
91+
b = pa.array([["a", "a", "a", "a"], ["x", "y"], ["y", "x"], [], ["d", "e", "f"]])[1:]
92+
input_array = pa.chunked_array(
93+
[
94+
pa.StructArray.from_arrays(
95+
arrays=[a, b],
96+
names=["a", "b"],
97+
)
98+
]
99+
* 3
100+
)
101+
output_array = align_chunked_struct_list_offsets(input_array)
102+
assert output_array.equals(input_array)
103+
104+
# Input is a "non-aligned" chunked array
105+
a = pa.array([[0, 0, 0], [1, 2], [3, 4], [], [5, 6, 7]])[1:]
106+
b = pa.array([["x", "y"], ["y", "x"], [], ["d", "e", "f"]])
107+
input_array = pa.chunked_array(
108+
[
109+
pa.StructArray.from_arrays(
110+
arrays=[a, b],
111+
names=["a", "b"],
112+
)
113+
]
114+
* 4
115+
)
116+
output_array = align_chunked_struct_list_offsets(input_array)
117+
assert output_array.equals(input_array)
118+
119+
120+
def test_validate_struct_list_type():
121+
"""Test validate_struct_list_type function."""
122+
with pytest.raises(ValueError):
123+
validate_struct_list_type(pa.float64())
124+
125+
with pytest.raises(ValueError):
126+
validate_struct_list_type(pa.list_(pa.struct({"a": pa.int64()})))
127+
128+
with pytest.raises(ValueError):
129+
validate_struct_list_type(pa.struct({"a": pa.float64()}))
130+
131+
with pytest.raises(ValueError):
132+
validate_struct_list_type(pa.struct({"a": pa.list_(pa.float64()), "b": pa.float64()}))
133+
134+
assert (
135+
validate_struct_list_type(pa.struct({"a": pa.list_(pa.float64()), "b": pa.list_(pa.float64())}))
136+
is None
137+
)
47138

48139

49140
def test_transpose_struct_list_type():

0 commit comments

Comments
 (0)