Skip to content

Commit c5dc9c7

Browse files
authored
Merge pull request #303 from lincc-frameworks/fix-assign-offset
Fix assignment with different chunks
2 parents 728d785 + 6d0e997 commit c5dc9c7

File tree

7 files changed

+111
-27
lines changed

7 files changed

+111
-27
lines changed

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ requires = [
4949
]
5050
build-backend = "setuptools.build_meta"
5151

52+
[tool.mypy]
53+
python_version = "3.10"
54+
5255
[tool.setuptools_scm]
5356
write_to = "src/nested_pandas/_version.py"
5457

src/nested_pandas/series/_storage/list_struct_storage.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,5 +80,5 @@ def type(self) -> pa.ListType:
8080

8181
@property
8282
def num_chunks(self) -> int:
83-
"""Number of chunks in the underlying array."""
83+
"""Number of chunks in the underlying array."""
8484
return self._data.num_chunks

src/nested_pandas/series/dtype.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
# Use Self, which is not available until Python 3.11
2-
from __future__ import annotations
1+
from __future__ import annotations # Self is not available in python 3.10
32

43
from collections.abc import Mapping
54

@@ -39,9 +38,9 @@ class NestedDtype(ExtensionDtype):
3938
"""Attributes to use as metadata for __eq__ and __hash__"""
4039

4140
@property
42-
def na_value(self) -> Type[pd.NA]:
41+
def na_value(self) -> Type[pd.NA]: # type: ignore[valid-type]
4342
"""The missing value for this dtype"""
44-
return pd.NA
43+
return pd.NA # type: ignore[return-value]
4544

4645
type = pd.DataFrame
4746
"""The type of the array's elements, always pd.DataFrame"""
@@ -57,6 +56,10 @@ def name(self) -> str:
5756
fields = ", ".join([f"{field}: [{dtype!s}]" for field, dtype in nice_dtypes.items()])
5857
return f"nested<{fields}>"
5958

59+
@name.setter
60+
def name(self, value: str):
61+
raise TypeError("name cannot be changed")
62+
6063
def __repr__(self) -> str:
6164
return self.name
6265

src/nested_pandas/series/ext_array.py

Lines changed: 36 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -32,28 +32,33 @@
3232
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
3333
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3434

35-
# typing.Self and "|" union syntax don't exist in Python 3.9
36-
from __future__ import annotations
35+
from __future__ import annotations # typing.Self is not available in Python 3.10
3736

3837
from collections.abc import Callable, Generator, Iterable, Iterator, Sequence
3938
from typing import Any, cast
4039

4140
import numpy as np
4241
import pandas as pd
4342
import pyarrow as pa
44-
from numpy.typing import ArrayLike
43+
from numpy.typing import ArrayLike, DTypeLike
4544
from pandas import Index
46-
from pandas._typing import InterpolateOptions, Self
45+
from pandas._typing import InterpolateOptions
4746
from pandas.api.extensions import no_default
48-
from pandas.core.arrays import ArrowExtensionArray, ExtensionArray
47+
from pandas.core.arrays import ArrowExtensionArray, ExtensionArray # type: ignore[attr-defined]
4948
from pandas.core.dtypes.common import is_float_dtype
50-
from pandas.core.indexers import check_array_indexer, unpack_tuple_and_ellipses, validate_indices
51-
from pandas.io.formats.format import format_array
49+
from pandas.core.indexers import ( # type: ignore[attr-defined]
50+
check_array_indexer,
51+
unpack_tuple_and_ellipses,
52+
validate_indices,
53+
)
54+
from pandas.io.formats.format import format_array # type: ignore[attr-defined]
5255

5356
from nested_pandas.series._storage import ListStructStorage, StructListStorage, TableStorage # noqa
5457
from nested_pandas.series.dtype import NestedDtype
5558
from nested_pandas.series.utils import (
59+
chunk_lengths,
5660
is_pa_type_a_list,
61+
rechunk,
5762
transpose_struct_list_type,
5863
)
5964

@@ -82,7 +87,7 @@
8287
purposes only and should never be used for anything else.
8388
"""
8489
try:
85-
from pandas.io.formats.format import _ExtensionArrayFormatter
90+
from pandas.io.formats.format import _ExtensionArrayFormatter # type: ignore[attr-defined]
8691
except ImportError:
8792
BOXED_NESTED_EXTENSION_ARRAY_FORMAT_TRICK = False
8893

@@ -186,7 +191,8 @@ def convert_df_to_pa_scalar(df: pd.DataFrame, *, pa_type: pa.StructType | None)
186191
for column in columns:
187192
series = df[column]
188193
if isinstance(series.dtype, NestedDtype):
189-
scalar = series.array.to_pyarrow_scalar(list_struct=True)
194+
# We do know that array is NestedExtensionArray and does have .to_pyarrow_scalar
195+
scalar = series.array.to_pyarrow_scalar(list_struct=True) # type: ignore[attr-defined]
190196
ty = scalar.type
191197
else:
192198
array = pa.array(series)
@@ -271,15 +277,16 @@ def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False) -> Self: #
271277

272278
# Tricky to implement but required by things like pd.read_csv
273279
@classmethod
274-
def _from_sequence_of_strings(cls, strings, *, dtype=None, copy: bool = False) -> Self: # type: ignore[name-defined] # noqa: F821
275-
return super()._from_sequence_of_strings(strings, dtype=dtype, copy=copy)
280+
def _from_sequence_of_strings(cls, strings, *, dtype=None, copy: bool = False) -> Self: # type: ignore[name-defined, misc] # noqa: F821
281+
# I don't know why mypy complains, the method IS in the base class
282+
return super()._from_sequence_of_strings(strings, dtype=dtype, copy=copy) # type: ignore[misc]
276283

277284
# We do not implement it. ArrowExtensionArray does not implement it for struct arrays
278285
@classmethod
279286
def _from_factorized(cls, values, original):
280287
return super()._from_factorized(values, original)
281288

282-
def __getitem__(self, item) -> Self | pd.DataFrame: # type: ignore[name-defined] # noqa: F821
289+
def __getitem__(self, item: ScalarIndexer) -> Self | pd.DataFrame: # type: ignore[name-defined, override] # noqa: F821
283290
item = check_array_indexer(self, item)
284291

285292
if isinstance(item, np.ndarray):
@@ -309,7 +316,7 @@ def __getitem__(self, item) -> Self | pd.DataFrame: # type: ignore[name-defined
309316
return type(self)(pa_array, validate=False)
310317

311318
def __setitem__(self, key, value) -> None:
312-
# TODO: optimize for many chunks
319+
# TODO: optimize for many chunks
313320
# https://github.com/lincc-frameworks/nested-pandas/issues/53
314321

315322
key = check_array_indexer(self, key)
@@ -369,7 +376,9 @@ def __iter__(self) -> Iterator[pd.DataFrame]:
369376
def __eq__(self, other):
370377
return super().__eq__(other)
371378

372-
def to_numpy(self, dtype: None = None, copy: bool = False, na_value: Any = no_default) -> np.ndarray:
379+
def to_numpy(
380+
self, dtype: DTypeLike | None = None, copy: bool = False, na_value: Any = no_default
381+
) -> np.ndarray:
373382
"""Convert the extension array to a numpy array.
374383
375384
Parameters
@@ -441,7 +450,7 @@ def interpolate(
441450
**kwargs,
442451
) -> Self: # type: ignore[name-defined] # noqa: F821
443452
"""Interpolate missing values, not implemented yet."""
444-
super().interpolate(
453+
return super().interpolate( # type: ignore[misc]
445454
method=method,
446455
axis=axis,
447456
index=index,
@@ -587,7 +596,7 @@ def equals(self, other) -> bool:
587596
return False
588597
return self._storage == other._storage
589598

590-
def dropna(self) -> Self:
599+
def dropna(self) -> Self: # type: ignore[name-defined] # noqa: F821
591600
"""Return a new ExtensionArray with missing values removed.
592601
593602
Note that this applies to the top-level struct array, not to the list arrays.
@@ -741,12 +750,18 @@ def _convert_struct_scalar_to_df(
741750
return na_value
742751
series = {}
743752
for name, list_scalar in value.items():
744-
dtype = self.dtype.field_dtype(name)
753+
dtype: pd.ArrowDtype | NestedDtype | None = self.dtype.field_dtype(name)
745754
# It gave pd.ArrowDtype for non-NestedDtype fields,
746755
# make it None if we'd like to use pandas "ordinary" dtypes.
747756
if not pyarrow_dtypes and not isinstance(dtype, NestedDtype):
748757
dtype = None
749-
series[name] = pd.Series(list_scalar.values, dtype=dtype, copy=copy, name=name)
758+
series[name] = pd.Series(
759+
list_scalar.values,
760+
# mypy doesn't understand that dtype is ExtensionDtype | None
761+
dtype=dtype, # type: ignore[arg-type]
762+
copy=copy,
763+
name=name,
764+
)
750765
return pd.DataFrame(series, copy=False)
751766

752767
@property
@@ -917,7 +932,7 @@ def flat_length(self) -> int:
917932

918933
@property
919934
def num_chunks(self) -> int:
920-
"""Number of chunks in underlying pyarrow.ChunkedArray"""
935+
"""Number of chunks in underlying pyarrow.ChunkedArray"""
921936
return self._storage.num_chunks
922937

923938
def get_list_index(self) -> np.ndarray:
@@ -1076,6 +1091,8 @@ def set_list_field(self, field: str, value: ArrayLike, *, keep_dtype: bool = Fal
10761091
if len(pa_array) != len(self):
10771092
raise ValueError("The length of the list-array must be equal to the length of the series")
10781093

1094+
pa_array = rechunk(pa_array, chunk_lengths(self.pa_table.column(0)))
1095+
10791096
if field in self.field_names:
10801097
field_idx = self.field_names.index(field)
10811098
pa_table = self.pa_table.drop(field).add_column(field_idx, field, pa_array)

src/nested_pandas/series/utils.py

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
1-
from __future__ import annotations # Python 3.9 requires it for X | Y type hints
1+
from __future__ import annotations # TYPE_CHECKING
22

33
from typing import TYPE_CHECKING, cast
44

5+
import numpy as np
56
import pandas as pd
67
import pyarrow as pa
8+
from numpy.typing import ArrayLike
79

810
if TYPE_CHECKING:
911
from nested_pandas.series.dtype import NestedDtype
@@ -314,3 +316,41 @@ def table_from_struct_array(array: pa.ChunkedArray | pa.array) -> pa.Table:
314316
if isinstance(array, pa.ChunkedArray) and array.num_chunks == 0:
315317
array = pa.array([], type=array.type)
316318
return pa.Table.from_struct_array(array)
319+
320+
321+
def chunk_lengths(array: pa.ChunkedArray) -> list[int]:
322+
"""Get the length of each chunk in an array."""
323+
return [len(chunk) for chunk in array.iterchunks()]
324+
325+
326+
def rechunk(array: pa.Array | pa.ChunkedArray, chunk_lens: ArrayLike) -> pa.ChunkedArray:
327+
"""Rechunk array into chunks of the given lengths.
328+
329+
If no rechunking is needed, the input is returned unchanged (a non-chunked array is first wrapped into a single-chunk chunked array).
330+
331+
Parameters
332+
----------
333+
array : pa.Array | pa.ChunkedArray
334+
Input chunked or non-chunked array to rechunk.
335+
chunk_lens : array-like of int
336+
Lengths of chunks.
337+
338+
Returns
339+
-------
340+
pa.ChunkedArray
341+
Rechunked `array`.
342+
"""
343+
if len(array) != np.sum(chunk_lens):
344+
raise ValueError("Input array must have the same length as the total chunk lengths")
345+
if isinstance(array, pa.Array):
346+
array = pa.chunked_array([array])
347+
348+
# Shortcut if no rechunk is needed:
349+
if chunk_lengths(array) == chunk_lens:
350+
return array
351+
chunk_indices = np.r_[0, np.cumsum(chunk_lens)]
352+
chunks = []
353+
for idx_start, idx_end in zip(chunk_indices[:-1], chunk_indices[1:], strict=True):
354+
chunk = array[idx_start:idx_end].combine_chunks()
355+
chunks.append(chunk)
356+
return pa.chunked_array(chunks)

tests/nested_pandas/nestedframe/test_nestedframe.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1608,3 +1608,24 @@ def test_auto_nest_on_dataframe_assignment():
16081608
assert (flat.values == nested.values).all()
16091609
assert list(flat.columns) == list(nested.columns)
16101610
assert list(flat.index) == list(nested.index)
1611+
1612+
1613+
def test_issue294():
1614+
"""https://github.com/lincc-frameworks/nested-pandas/issues/294"""
1615+
nf1 = generate_data(3, 5)
1616+
nf2 = generate_data(4, 6)
1617+
nf = pd.concat([nf1, nf2])
1618+
nf["c"] = range(7)
1619+
# Check if we did concatenation right
1620+
assert nf.shape[0] == 7
1621+
# We need multiple chunk_lens in the nested columns for the test setup
1622+
assert nf.nested.array.list_array.num_chunks == 2
1623+
# And no chunk_lens in the base column
1624+
c_pa_array = pa.array(nf["c"])
1625+
assert isinstance(c_pa_array, pa.Array) or (
1626+
isinstance(c_pa_array, pa.ChunkedArray) and c_pa_array.num_chunks == 1
1627+
)
1628+
1629+
# Failed with a ValueError in the original issue
1630+
nf["nested.c"] = nf["c"]
1631+
nf["nested.mag"] = -2.5 * np.log10(nf["nested.flux"])

tests/nested_pandas/series/test_ext_array.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from nested_pandas.nestedframe.core import NestedFrame
1111
from nested_pandas.series.ext_array import NestedExtensionArray, convert_df_to_pa_scalar, replace_with_mask
1212
from numpy.testing import assert_array_equal
13-
from pandas.core.arrays import ArrowExtensionArray
13+
from pandas.core.arrays import ArrowExtensionArray # type: ignore[attr-defined]
1414
from pandas.testing import assert_frame_equal, assert_series_equal
1515

1616

@@ -688,7 +688,7 @@ def test_list_offsets_single_chunk():
688688

689689

690690
def test_list_offsets_multiple_chunks():
691-
"""Test that the .list_offset property is correct for multiple chunks."""
691+
"""Test that the .list_offset property is correct for multiple chunks."""
692692
struct_array = pa.StructArray.from_arrays(
693693
arrays=[
694694
pa.array([np.array([1, 2, 3]), np.array([1, 2, 1])], type=pa.list_(pa.uint8())),

0 commit comments

Comments
 (0)