|
32 | 32 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
33 | 33 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
34 | 34 |
|
35 |
| -# typing.Self and "|" union syntax don't exist in Python 3.9 |
36 |
| -from __future__ import annotations |
| 35 | +from __future__ import annotations # "|" union syntax needs Python 3.10+, typing.Self needs 3.11+ |
37 | 36 |
|
38 | 37 | from collections.abc import Callable, Generator, Iterable, Iterator, Sequence
|
39 | 38 | from typing import Any, cast
|
40 | 39 |
|
41 | 40 | import numpy as np
|
42 | 41 | import pandas as pd
|
43 | 42 | import pyarrow as pa
|
44 |
| -from numpy.typing import ArrayLike |
| 43 | +from numpy.typing import ArrayLike, DTypeLike |
45 | 44 | from pandas import Index
|
46 |
| -from pandas._typing import InterpolateOptions, Self |
| 45 | +from pandas._typing import InterpolateOptions |
47 | 46 | from pandas.api.extensions import no_default
|
48 |
| -from pandas.core.arrays import ArrowExtensionArray, ExtensionArray |
| 47 | +from pandas.core.arrays import ArrowExtensionArray, ExtensionArray # type: ignore[attr-defined] |
49 | 48 | from pandas.core.dtypes.common import is_float_dtype
|
50 |
| -from pandas.core.indexers import check_array_indexer, unpack_tuple_and_ellipses, validate_indices |
51 |
| -from pandas.io.formats.format import format_array |
| 49 | +from pandas.core.indexers import ( # type: ignore[attr-defined] |
| 50 | + check_array_indexer, |
| 51 | + unpack_tuple_and_ellipses, |
| 52 | + validate_indices, |
| 53 | +) |
| 54 | +from pandas.io.formats.format import format_array # type: ignore[attr-defined] |
52 | 55 |
|
53 | 56 | from nested_pandas.series._storage import ListStructStorage, StructListStorage, TableStorage # noqa
|
54 | 57 | from nested_pandas.series.dtype import NestedDtype
|
55 | 58 | from nested_pandas.series.utils import (
|
| 59 | + chunk_lengths, |
56 | 60 | is_pa_type_a_list,
|
| 61 | + rechunk, |
57 | 62 | transpose_struct_list_type,
|
58 | 63 | )
|
59 | 64 |
|
|
82 | 87 | purposes only and should never be used for anything else.
|
83 | 88 | """
|
84 | 89 | try:
|
85 |
| - from pandas.io.formats.format import _ExtensionArrayFormatter |
| 90 | + from pandas.io.formats.format import _ExtensionArrayFormatter # type: ignore[attr-defined] |
86 | 91 | except ImportError:
|
87 | 92 | BOXED_NESTED_EXTENSION_ARRAY_FORMAT_TRICK = False
|
88 | 93 |
|
@@ -186,7 +191,8 @@ def convert_df_to_pa_scalar(df: pd.DataFrame, *, pa_type: pa.StructType | None)
|
186 | 191 | for column in columns:
|
187 | 192 | series = df[column]
|
188 | 193 | if isinstance(series.dtype, NestedDtype):
|
189 |
| - scalar = series.array.to_pyarrow_scalar(list_struct=True) |
| 194 | + # series.array is a NestedExtensionArray here, so .to_pyarrow_scalar is available |
| 195 | + scalar = series.array.to_pyarrow_scalar(list_struct=True) # type: ignore[attr-defined] |
190 | 196 | ty = scalar.type
|
191 | 197 | else:
|
192 | 198 | array = pa.array(series)
|
@@ -271,15 +277,16 @@ def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False) -> Self: #
|
271 | 277 |
|
272 | 278 | # Tricky to implement but required by things like pd.read_csv
|
273 | 279 | @classmethod
|
274 |
| - def _from_sequence_of_strings(cls, strings, *, dtype=None, copy: bool = False) -> Self: # type: ignore[name-defined] # noqa: F821 |
275 |
| - return super()._from_sequence_of_strings(strings, dtype=dtype, copy=copy) |
| 280 | + def _from_sequence_of_strings(cls, strings, *, dtype=None, copy: bool = False) -> Self: # type: ignore[name-defined, misc] # noqa: F821 |
| 281 | + # mypy flags this call even though the method exists in the base class |
| 282 | + return super()._from_sequence_of_strings(strings, dtype=dtype, copy=copy) # type: ignore[misc] |
276 | 283 |
|
277 | 284 | # We do not implement it. ArrowExtensionArray does not implement it for struct arrays
|
278 | 285 | @classmethod
|
279 | 286 | def _from_factorized(cls, values, original):
|
280 | 287 | return super()._from_factorized(values, original)
|
281 | 288 |
|
282 |
| - def __getitem__(self, item) -> Self | pd.DataFrame: # type: ignore[name-defined] # noqa: F821 |
| 289 | + def __getitem__(self, item: ScalarIndexer) -> Self | pd.DataFrame: # type: ignore[name-defined, override] # noqa: F821 |
283 | 290 | item = check_array_indexer(self, item)
|
284 | 291 |
|
285 | 292 | if isinstance(item, np.ndarray):
|
@@ -309,7 +316,7 @@ def __getitem__(self, item) -> Self | pd.DataFrame: # type: ignore[name-defined
|
309 | 316 | return type(self)(pa_array, validate=False)
|
310 | 317 |
|
311 | 318 | def __setitem__(self, key, value) -> None:
|
312 |
| - # TODO: optimize for many chunks |
| 319 | + # TODO: optimize for many chunks |
313 | 320 | # https://github.com/lincc-frameworks/nested-pandas/issues/53
|
314 | 321 |
|
315 | 322 | key = check_array_indexer(self, key)
|
@@ -369,7 +376,9 @@ def __iter__(self) -> Iterator[pd.DataFrame]:
|
369 | 376 | def __eq__(self, other):
|
370 | 377 | return super().__eq__(other)
|
371 | 378 |
|
372 |
| - def to_numpy(self, dtype: None = None, copy: bool = False, na_value: Any = no_default) -> np.ndarray: |
| 379 | + def to_numpy( |
| 380 | + self, dtype: DTypeLike | None = None, copy: bool = False, na_value: Any = no_default |
| 381 | + ) -> np.ndarray: |
373 | 382 | """Convert the extension array to a numpy array.
|
374 | 383 |
|
375 | 384 | Parameters
|
@@ -441,7 +450,7 @@ def interpolate(
|
441 | 450 | **kwargs,
|
442 | 451 | ) -> Self: # type: ignore[name-defined] # noqa: F821
|
443 | 452 | """Interpolate missing values, not implemented yet."""
|
444 |
| - super().interpolate( |
| 453 | + return super().interpolate( # type: ignore[misc] |
445 | 454 | method=method,
|
446 | 455 | axis=axis,
|
447 | 456 | index=index,
|
@@ -587,7 +596,7 @@ def equals(self, other) -> bool:
|
587 | 596 | return False
|
588 | 597 | return self._storage == other._storage
|
589 | 598 |
|
590 |
| - def dropna(self) -> Self: |
| 599 | + def dropna(self) -> Self: # type: ignore[name-defined] # noqa: F821 |
591 | 600 | """Return a new ExtensionArray with missing values removed.
|
592 | 601 |
|
593 | 602 | Note that this applies to the top-level struct array, not to the list arrays.
|
@@ -741,12 +750,18 @@ def _convert_struct_scalar_to_df(
|
741 | 750 | return na_value
|
742 | 751 | series = {}
|
743 | 752 | for name, list_scalar in value.items():
|
744 |
| - dtype = self.dtype.field_dtype(name) |
| 753 | + dtype: pd.ArrowDtype | NestedDtype | None = self.dtype.field_dtype(name) |
745 | 754 | # It gave pd.ArrowDtype for non-NestedDtype fields,
|
746 | 755 | # make it None if we'd like to use pandas "ordinary" dtypes.
|
747 | 756 | if not pyarrow_dtypes and not isinstance(dtype, NestedDtype):
|
748 | 757 | dtype = None
|
749 |
| - series[name] = pd.Series(list_scalar.values, dtype=dtype, copy=copy, name=name) |
| 758 | + series[name] = pd.Series( |
| 759 | + list_scalar.values, |
| 760 | + # mypy doesn't understand that dtype is ExtensionDtype | None |
| 761 | + dtype=dtype, # type: ignore[arg-type] |
| 762 | + copy=copy, |
| 763 | + name=name, |
| 764 | + ) |
750 | 765 | return pd.DataFrame(series, copy=False)
|
751 | 766 |
|
752 | 767 | @property
|
@@ -917,7 +932,7 @@ def flat_length(self) -> int:
|
917 | 932 |
|
918 | 933 | @property
|
919 | 934 | def num_chunks(self) -> int:
|
920 |
| - """Number of chunks in underlying pyarrow.ChunkedArray""" |
| 935 | + """Number of chunk_lens in underlying pyarrow.ChunkedArray""" |
921 | 936 | return self._storage.num_chunks
|
922 | 937 |
|
923 | 938 | def get_list_index(self) -> np.ndarray:
|
@@ -1076,6 +1091,8 @@ def set_list_field(self, field: str, value: ArrayLike, *, keep_dtype: bool = Fal
|
1076 | 1091 | if len(pa_array) != len(self):
|
1077 | 1092 | raise ValueError("The length of the list-array must be equal to the length of the series")
|
1078 | 1093 |
|
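| + # Rechunk the new array to match the chunk layout of the existing columns |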
| 1094 | + pa_array = rechunk(pa_array, chunk_lengths(self.pa_table.column(0))) |
| 1095 | + |
1079 | 1096 | if field in self.field_names:
|
1080 | 1097 | field_idx = self.field_names.index(field)
|
1081 | 1098 | pa_table = self.pa_table.drop(field).add_column(field_idx, field, pa_array)
|