Merge pull request #303 from lincc-frameworks/fix-assign-offset

hombit · web-flow · commit c5dc9c7a44a1 · 2025-07-03T13:04:56.000-04:00
Fix assignment with different chunks
diff --git a/pyproject.toml b/pyproject.toml
@@ -49,6 +49,9 @@ requires = [
 ]
 build-backend = "setuptools.build_meta"
 
+[tool.mypy]
+python_version = "3.10"
+
 [tool.setuptools_scm]
 write_to = "src/nested_pandas/_version.py"
 
diff --git a/src/nested_pandas/series/_storage/list_struct_storage.py b/src/nested_pandas/series/_storage/list_struct_storage.py
@@ -80,5 +80,5 @@ def type(self) -> pa.ListType:
 
     @property
     def num_chunks(self) -> int:
-        """Number of chunks in the underlying array."""
+        """Number of chunks in the underlying array."""
         return self._data.num_chunks
diff --git a/src/nested_pandas/series/dtype.py b/src/nested_pandas/series/dtype.py
@@ -1,5 +1,4 @@
-# Use Self, which is not available until Python 3.11
-from __future__ import annotations
+from __future__ import annotations  # Self is not available in python 3.10
 
 from collections.abc import Mapping
 
@@ -39,9 +38,9 @@ class NestedDtype(ExtensionDtype):
     """Attributes to use as metadata for __eq__ and __hash__"""
 
     @property
-    def na_value(self) -> Type[pd.NA]:
+    def na_value(self) -> Type[pd.NA]:  # type: ignore[valid-type]
         """The missing value for this dtype"""
-        return pd.NA
+        return pd.NA  # type: ignore[return-value]
 
     type = pd.DataFrame
     """The type of the array's elements, always pd.DataFrame"""
@@ -57,6 +56,10 @@ def name(self) -> str:
         fields = ", ".join([f"{field}: [{dtype!s}]" for field, dtype in nice_dtypes.items()])
         return f"nested<{fields}>"
 
+    @name.setter
+    def name(self, value: str):
+        raise TypeError("name cannot be changed")
+
     def __repr__(self) -> str:
         return self.name
 
diff --git a/src/nested_pandas/series/ext_array.py b/src/nested_pandas/series/ext_array.py
@@ -32,28 +32,33 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-# typing.Self and "|" union syntax don't exist in Python 3.9
-from __future__ import annotations
+from __future__ import annotations  # typing.Self is not available in Python 3.10
 
 from collections.abc import Callable, Generator, Iterable, Iterator, Sequence
 from typing import Any, cast
 
 import numpy as np
 import pandas as pd
 import pyarrow as pa
-from numpy.typing import ArrayLike
+from numpy.typing import ArrayLike, DTypeLike
 from pandas import Index
-from pandas._typing import InterpolateOptions, Self
+from pandas._typing import InterpolateOptions
 from pandas.api.extensions import no_default
-from pandas.core.arrays import ArrowExtensionArray, ExtensionArray
+from pandas.core.arrays import ArrowExtensionArray, ExtensionArray  # type: ignore[attr-defined]
 from pandas.core.dtypes.common import is_float_dtype
-from pandas.core.indexers import check_array_indexer, unpack_tuple_and_ellipses, validate_indices
-from pandas.io.formats.format import format_array
+from pandas.core.indexers import (  # type: ignore[attr-defined]
+    check_array_indexer,
+    unpack_tuple_and_ellipses,
+    validate_indices,
+)
+from pandas.io.formats.format import format_array  # type: ignore[attr-defined]
 
 from nested_pandas.series._storage import ListStructStorage, StructListStorage, TableStorage  # noqa
 from nested_pandas.series.dtype import NestedDtype
 from nested_pandas.series.utils import (
+    chunk_lengths,
     is_pa_type_a_list,
+    rechunk,
     transpose_struct_list_type,
 )
 
@@ -82,7 +87,7 @@
 purposes only and should never be used for anything else.
 """
 try:
-    from pandas.io.formats.format import _ExtensionArrayFormatter
+    from pandas.io.formats.format import _ExtensionArrayFormatter  # type: ignore[attr-defined]
 except ImportError:
     BOXED_NESTED_EXTENSION_ARRAY_FORMAT_TRICK = False
 
@@ -186,7 +191,8 @@ def convert_df_to_pa_scalar(df: pd.DataFrame, *, pa_type: pa.StructType | None)
     for column in columns:
         series = df[column]
         if isinstance(series.dtype, NestedDtype):
-            scalar = series.array.to_pyarrow_scalar(list_struct=True)
+            # We do know that array is NestedExtensionArray and does have .to_pyarrow_scalar
+            scalar = series.array.to_pyarrow_scalar(list_struct=True)  # type: ignore[attr-defined]
             ty = scalar.type
         else:
             array = pa.array(series)
@@ -271,15 +277,16 @@ def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False) -> Self:  #
 
     # Tricky to implement but required by things like pd.read_csv
     @classmethod
-    def _from_sequence_of_strings(cls, strings, *, dtype=None, copy: bool = False) -> Self:  # type: ignore[name-defined] # noqa: F821
-        return super()._from_sequence_of_strings(strings, dtype=dtype, copy=copy)
+    def _from_sequence_of_strings(cls, strings, *, dtype=None, copy: bool = False) -> Self:  # type: ignore[name-defined, misc] # noqa: F821
+        # I don't know why mypy complains, the method IS in the base class
+        return super()._from_sequence_of_strings(strings, dtype=dtype, copy=copy)  # type: ignore[misc]
 
     # We do not implement it. ArrowExtensionArray does not implement it for struct arrays
     @classmethod
     def _from_factorized(cls, values, original):
         return super()._from_factorized(values, original)
 
-    def __getitem__(self, item) -> Self | pd.DataFrame:  # type: ignore[name-defined] # noqa: F821
+    def __getitem__(self, item: ScalarIndexer) -> Self | pd.DataFrame:  # type: ignore[name-defined, override] # noqa: F821
         item = check_array_indexer(self, item)
 
         if isinstance(item, np.ndarray):
@@ -309,7 +316,7 @@ def __getitem__(self, item) -> Self | pd.DataFrame:  # type: ignore[name-defined
         return type(self)(pa_array, validate=False)
 
     def __setitem__(self, key, value) -> None:
-        # TODO: optimize for many chunks
+        # TODO: optimize for many chunks
         # https://github.com/lincc-frameworks/nested-pandas/issues/53
 
         key = check_array_indexer(self, key)
@@ -369,7 +376,9 @@ def __iter__(self) -> Iterator[pd.DataFrame]:
     def __eq__(self, other):
         return super().__eq__(other)
 
-    def to_numpy(self, dtype: None = None, copy: bool = False, na_value: Any = no_default) -> np.ndarray:
+    def to_numpy(
+        self, dtype: DTypeLike | None = None, copy: bool = False, na_value: Any = no_default
+    ) -> np.ndarray:
         """Convert the extension array to a numpy array.
 
         Parameters
@@ -441,7 +450,7 @@ def interpolate(
         **kwargs,
     ) -> Self:  # type: ignore[name-defined] # noqa: F821
         """Interpolate missing values, not implemented yet."""
-        super().interpolate(
+        return super().interpolate(  # type: ignore[misc]
             method=method,
             axis=axis,
             index=index,
@@ -587,7 +596,7 @@ def equals(self, other) -> bool:
             return False
         return self._storage == other._storage
 
-    def dropna(self) -> Self:
+    def dropna(self) -> Self:  # type: ignore[name-defined] # noqa: F821
         """Return a new ExtensionArray with missing values removed.
 
         Note that this applies to the top-level struct array, not to the list arrays.
@@ -741,12 +750,18 @@ def _convert_struct_scalar_to_df(
             return na_value
         series = {}
         for name, list_scalar in value.items():
-            dtype = self.dtype.field_dtype(name)
+            dtype: pd.ArrowDtype | NestedDtype | None = self.dtype.field_dtype(name)
             # It gave pd.ArrowDtype for non-NestedDtype fields,
             # make it None if we'd like to use pandas "ordinary" dtypes.
             if not pyarrow_dtypes and not isinstance(dtype, NestedDtype):
                 dtype = None
-            series[name] = pd.Series(list_scalar.values, dtype=dtype, copy=copy, name=name)
+            series[name] = pd.Series(
+                list_scalar.values,
+                # mypy doesn't understand that dtype is ExtensionDtype | None
+                dtype=dtype,  # type: ignore[arg-type]
+                copy=copy,
+                name=name,
+            )
         return pd.DataFrame(series, copy=False)
 
     @property
@@ -917,7 +932,7 @@ def flat_length(self) -> int:
 
     @property
     def num_chunks(self) -> int:
-        """Number of chunks in underlying pyarrow.ChunkedArray"""
+        """Number of chunks in underlying pyarrow.ChunkedArray"""
         return self._storage.num_chunks
 
     def get_list_index(self) -> np.ndarray:
@@ -1076,6 +1091,8 @@ def set_list_field(self, field: str, value: ArrayLike, *, keep_dtype: bool = Fal
         if len(pa_array) != len(self):
             raise ValueError("The length of the list-array must be equal to the length of the series")
 
+        pa_array = rechunk(pa_array, chunk_lengths(self.pa_table.column(0)))
+
         if field in self.field_names:
             field_idx = self.field_names.index(field)
             pa_table = self.pa_table.drop(field).add_column(field_idx, field, pa_array)
diff --git a/src/nested_pandas/series/utils.py b/src/nested_pandas/series/utils.py
@@ -1,9 +1,11 @@
-from __future__ import annotations  # Python 3.9 requires it for X | Y type hints
+from __future__ import annotations  # TYPE_CHECKING
 
 from typing import TYPE_CHECKING, cast
 
+import numpy as np
 import pandas as pd
 import pyarrow as pa
+from numpy.typing import ArrayLike
 
 if TYPE_CHECKING:
     from nested_pandas.series.dtype import NestedDtype
@@ -314,3 +316,41 @@ def table_from_struct_array(array: pa.ChunkedArray | pa.array) -> pa.Table:
     if isinstance(array, pa.ChunkedArray) and array.num_chunks == 0:
         array = pa.array([], type=array.type)
     return pa.Table.from_struct_array(array)
+
+
+def chunk_lengths(array: pa.ChunkedArray) -> list[int]:
+    """Return the lengths of the chunks of a chunked array, in chunk order."""
+    return list(map(len, array.iterchunks()))
+
+
+def rechunk(array: pa.Array | pa.ChunkedArray, chunk_lens: ArrayLike) -> pa.ChunkedArray:
+    """Rechunk array so that its chunks have the given lengths.
+
+    If no rechunk is needed the input is returned as is (wrapped into a chunked array if needed).
+
+    Parameters
+    ----------
+    array : pa.Array | pa.ChunkedArray
+        Input chunked or non-chunked array to rechunk.
+    chunk_lens : array-like of int
+        Lengths of the output chunks; they must sum up to ``len(array)``.
+
+    Returns
+    -------
+    pa.ChunkedArray
+        Rechunked `array`.
+    """
+    chunk_lens = np.asarray(chunk_lens, dtype=np.int64)
+    if len(array) != chunk_lens.sum():
+        raise ValueError("Input array must have the same length as the total chunk lengths")
+    if isinstance(array, pa.Array):
+        array = pa.chunked_array([array])
+    # Shortcut if no rechunk is needed; array_equal is safe for any array-like, unlike list ==
+    if np.array_equal(chunk_lengths(array), chunk_lens):
+        return array
+    # Chunk boundaries; the int64 dtype keeps them valid slice indices even with no chunks
+    bounds = np.r_[0, np.cumsum(chunk_lens)]
+    chunks = [
+        array[start:end].combine_chunks() for start, end in zip(bounds[:-1], bounds[1:], strict=True)
+    ]
+    return pa.chunked_array(chunks, type=array.type)
diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py
@@ -1608,3 +1608,24 @@ def test_auto_nest_on_dataframe_assignment():
         assert (flat.values == nested.values).all()
         assert list(flat.columns) == list(nested.columns)
         assert list(flat.index) == list(nested.index)
+
+
+def test_issue294():
+    """https://github.com/lincc-frameworks/nested-pandas/issues/294"""
+    nf1 = generate_data(3, 5)
+    nf2 = generate_data(4, 6)
+    nf = pd.concat([nf1, nf2])
+    nf["c"] = range(7)
+    # Check that the concatenation produced all 3 + 4 rows
+    assert nf.shape[0] == 7
+    # We need multiple chunks in the nested columns for the test setup
+    assert nf.nested.array.list_array.num_chunks == 2
+    # And a non-chunked (or single-chunk) array for the base column
+    c_pa_array = pa.array(nf["c"])
+    assert isinstance(c_pa_array, pa.Array) or (
+        isinstance(c_pa_array, pa.ChunkedArray) and c_pa_array.num_chunks == 1
+    )
+
+    # Failed with a ValueError in the original issue
+    nf["nested.c"] = nf["c"]
+    nf["nested.mag"] = -2.5 * np.log10(nf["nested.flux"])
diff --git a/tests/nested_pandas/series/test_ext_array.py b/tests/nested_pandas/series/test_ext_array.py
@@ -10,7 +10,7 @@
 from nested_pandas.nestedframe.core import NestedFrame
 from nested_pandas.series.ext_array import NestedExtensionArray, convert_df_to_pa_scalar, replace_with_mask
 from numpy.testing import assert_array_equal
-from pandas.core.arrays import ArrowExtensionArray
+from pandas.core.arrays import ArrowExtensionArray  # type: ignore[attr-defined]
 from pandas.testing import assert_frame_equal, assert_series_equal
 
 
@@ -688,7 +688,7 @@ def test_list_offsets_single_chunk():
 
 
 def test_list_offsets_multiple_chunks():
-    """Test that the .list_offset property is correct for multiple chunks."""
+    """Test that the .list_offset property is correct for multiple chunks."""
     struct_array = pa.StructArray.from_arrays(
         arrays=[
             pa.array([np.array([1, 2, 3]), np.array([1, 2, 1])], type=pa.list_(pa.uint8())),

Original file line number	Diff line number	Diff line change
`@@ -49,6 +49,9 @@ requires = [`
`49`	`49`	`]`
`50`	`50`	`build-backend = "setuptools.build_meta"`
`51`	`51`
	`52`	`+[tool.mypy]`
	`53`	`+python_version = "3.10"`
	`54`	`+`
`52`	`55`	`[tool.setuptools_scm]`
`53`	`56`	`write_to = "src/nested_pandas/_version.py"`
`54`	`57`