diff --git a/python/pyarrow-stubs/pyarrow/_csv.pyi b/python/pyarrow-stubs/pyarrow/_csv.pyi new file mode 100644 index 000000000000..6c911a8b0c1d --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_csv.pyi @@ -0,0 +1,132 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from collections.abc import Callable, Sequence +from dataclasses import dataclass, field +from typing import IO, Any, Literal + +from _typeshed import StrPath + +from . import lib + + +@dataclass(kw_only=True) +class ReadOptions(lib._Weakrefable): + use_threads: bool = field(default=True, kw_only=False) # noqa: Y015 + block_size: int | float | None = None + skip_rows: int = 0 + skip_rows_after_names: int = 0 + column_names: Sequence[str] | None = None + autogenerate_column_names: bool = False + encoding: str = "utf8" + def validate(self) -> None: ... + + +@dataclass(kw_only=True) +class ParseOptions(lib._Weakrefable): + delimiter: str = field(default=",", kw_only=False) # noqa: Y015 + quote_char: str | Literal[False] = '"' + double_quote: bool = True + escape_char: str | Literal[False] = False + newlines_in_values: bool = False + ignore_empty_lines: bool = True + invalid_row_handler: Callable[[InvalidRow], str] | None = None + + def validate(self) -> None: ... + + +@dataclass(kw_only=True) +class ConvertOptions(lib._Weakrefable): + check_utf8: bool = field(default=True, kw_only=False) # noqa: Y015 + column_types: lib.Schema | dict | Sequence[tuple[str, lib.DataType]] | None = None + null_values: list[str] | None = None + true_values: list[str] | None = None + false_values: list[str] | None = None + decimal_point: str = "." + strings_can_be_null: bool = False + quoted_strings_can_be_null: bool = True + include_columns: list[str] | None = None + include_missing_columns: bool = False + auto_dict_encode: bool = False + auto_dict_max_cardinality: int | None = None + timestamp_parsers: Sequence[str | lib._Weakrefable] | None = None + + def validate(self) -> None: ... + + +@dataclass(kw_only=True) +class WriteOptions(lib._Weakrefable): + include_header: bool = field(default=True, kw_only=False) # noqa: Y015 + batch_size: int = 1024 + delimiter: str = "," + quoting_style: Literal["needed", "all_valid", "none"] = "needed" + quoting_header: Literal["needed", "all_valid", "none"] = "needed" + + def validate(self) -> None: ... + + +@dataclass +class InvalidRow(lib._Weakrefable): + expected_columns: int + actual_columns: int + number: int | None + text: str + + +class CSVWriter(lib._CRecordBatchWriter): + def __init__( + self, + # TODO: OutputStream + sink: StrPath | IO[Any], + schema: lib.Schema, + write_options: WriteOptions | None = None, + *, + memory_pool: lib.MemoryPool | None = None, + ) -> None: ... + + +class CSVStreamingReader(lib.RecordBatchReader): + ... + + +ISO8601: lib._Weakrefable + + +def open_csv( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + convert_options: ConvertOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> CSVStreamingReader: ... + + +def read_csv( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + convert_options: ConvertOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Table: ... + + +def write_csv( + data: lib.RecordBatch | lib.Table, + output_file: StrPath | lib.NativeFile | IO[Any], + write_options: WriteOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> None: ... diff --git a/python/pyarrow-stubs/pyarrow/_feather.pyi b/python/pyarrow-stubs/pyarrow/_feather.pyi new file mode 100644 index 000000000000..2f4757cd5f1a --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_feather.pyi @@ -0,0 +1,51 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import IO, Literal +from collections.abc import Sequence + +from _typeshed import StrPath + +from .lib import Buffer, NativeFile, Table, _Weakrefable + + +class FeatherError(Exception): + ... + + +def write_feather( + table: Table, + dest: StrPath | IO | NativeFile, + compression: str | None = None, + compression_level: int | None = None, + chunksize: int | None = None, + version: Literal[1, 2] = 2, +): ... + + +class FeatherReader(_Weakrefable): + def __init__( + self, + source: StrPath | IO | NativeFile | Buffer, + use_memory_map: bool, + use_threads: bool, + ) -> None: ... + @property + def version(self) -> str: ... + def read(self) -> Table: ... + def read_indices(self, indices: Sequence[int]) -> Table: ... + def read_names(self, names: Sequence[str]) -> Table: ... diff --git a/python/pyarrow-stubs/pyarrow/_ipc.pyi b/python/pyarrow-stubs/pyarrow/_ipc.pyi new file mode 100644 index 000000000000..5a87f2439046 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_ipc.pyi @@ -0,0 +1,317 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import enum +import sys + +from io import IOBase + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self + +from collections.abc import Iterable, Iterator, Mapping +from typing import Any, Literal, NamedTuple + +import pandas as pd + +from pyarrow._stubs_typing import SupportPyBuffer +from pyarrow.lib import MemoryPool, RecordBatch, Schema, Table, Tensor, _Weakrefable + +from .io import Buffer, Codec, NativeFile, BufferReader +from ._types import DictionaryMemo, KeyValueMetadata + + +class MetadataVersion(enum.IntEnum): + V1 = enum.auto() + V2 = enum.auto() + V3 = enum.auto() + V4 = enum.auto() + V5 = enum.auto() + + +class Alignment(enum.IntEnum): + Any = enum.auto() + At64Byte = enum.auto() + DataTypeSpecific = enum.auto() + + +class WriteStats(NamedTuple): + num_messages: int + num_record_batches: int + num_dictionary_batches: int + num_dictionary_deltas: int + num_replaced_dictionaries: int + + +class ReadStats(NamedTuple): + num_messages: int + num_record_batches: int + num_dictionary_batches: int + num_dictionary_deltas: int + num_replaced_dictionaries: int + + +class IpcReadOptions(_Weakrefable): + ensure_native_endian: bool + use_threads: bool + ensure_alignment: Alignment + included_fields: list[int] | None + + def __init__( + self, + *, + ensure_native_endian: bool = True, + use_threads: bool = True, + ensure_alignment: Alignment = ..., + included_fields: list[int] | None = None, + ) -> None: ... + + +class IpcWriteOptions(_Weakrefable): + metadata_version: Any + allow_64bit: bool + use_legacy_format: bool + compression: Any + use_threads: bool + emit_dictionary_deltas: bool + unify_dictionaries: bool + + def __init__( + self, + *, + metadata_version: MetadataVersion = MetadataVersion.V5, + allow_64bit: bool = False, + use_legacy_format: bool = False, + compression: Codec | Literal["lz4", "zstd"] | None = None, + use_threads: bool = True, + emit_dictionary_deltas: bool = False, + unify_dictionaries: bool = False, + ) -> None: ... + + +class Message(_Weakrefable): + @property + def type(self) -> str: ... + @property + def metadata(self) -> Buffer: ... + @property + def metadata_version(self) -> MetadataVersion: ... + @property + def body(self) -> Buffer | None: ... + def equals(self, other: Message) -> bool: ... + + def serialize_to(self, sink: NativeFile, alignment: int = 8, + memory_pool: MemoryPool | None = None): ... + + def serialize(self, alignment: int = 8, memory_pool: MemoryPool | + None = None) -> Buffer: ... + + +class MessageReader(_Weakrefable): + @classmethod + def open_stream(cls, source: bytes | NativeFile | + IOBase | SupportPyBuffer) -> Self: ... + + def __iter__(self) -> Self: ... + def read_next_message(self) -> Message: ... + + __next__ = read_next_message + +# ---------------------------------------------------------------------- +# File and stream readers and writers + + +class _CRecordBatchWriter(_Weakrefable): + def write(self, table_or_batch: Table | RecordBatch): ... + + def write_batch( + self, + batch: RecordBatch, + custom_metadata: Mapping[bytes, bytes] | KeyValueMetadata | None = None, + ): ... + + def write_table(self, table: Table, max_chunksize: int | None = None) -> None: ... + + def close(self) -> None: ... + + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_val, exc_tb): ... + @property + def stats(self) -> WriteStats: ... + + +class _RecordBatchStreamWriter(_CRecordBatchWriter): + @property + def _use_legacy_format(self) -> bool: ... + @property + def _metadata_version(self) -> MetadataVersion: ... + + def _open( + self, + sink, + schema: Schema, + options: IpcWriteOptions = IpcWriteOptions(), # noqa: Y011 + metadata: dict[bytes, bytes] | None = None, + ): ... + + +class _ReadPandasMixin: + def read_pandas(self, **options) -> pd.DataFrame: ... + + +class RecordBatchReader(_ReadPandasMixin, _Weakrefable): + def __iter__(self) -> Self: ... + def read_next_batch(self) -> RecordBatch: ... + + __next__ = read_next_batch + @property + def schema(self) -> Schema: ... + + def read_next_batch_with_custom_metadata(self) -> RecordBatchWithMetadata: ... + + def iter_batches_with_custom_metadata( + self, + ) -> Iterator[RecordBatchWithMetadata]: ... + + def read_all(self) -> Table: ... + + def close(self) -> None: ... + + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_val, exc_tb): ... + def cast(self, target_schema: Schema) -> Self: ... + + def _export_to_c(self, out_ptr: int) -> None: ... + + @classmethod + def _import_from_c(cls, in_ptr: int) -> Self: ... + + def __arrow_c_stream__(self, requested_schema=None): ... + + @classmethod + def _import_from_c_capsule(cls, stream) -> Self: ... + + @classmethod + def from_stream(cls, data: Any, + schema: Any = None) -> Self: ... + + @classmethod + def from_batches(cls, schema: Any, batches: Iterable[RecordBatch]) -> Self: ... + + +class _RecordBatchStreamReader(RecordBatchReader): + @property + def stats(self) -> ReadStats: ... + + def _open( + self, + source, + options: IpcReadOptions | None = None, + memory_pool: MemoryPool | None = None, + ) -> Self: ... + + +class _RecordBatchFileWriter(_RecordBatchStreamWriter): + ... + + +class RecordBatchWithMetadata(NamedTuple): + batch: RecordBatch + custom_metadata: KeyValueMetadata + + +class _RecordBatchFileReader(_ReadPandasMixin, _Weakrefable): + @property + def num_record_batches(self) -> int: ... + + def get_batch(self, i: int) -> RecordBatch: ... + + get_record_batch = get_batch + def get_batch_with_custom_metadata(self, i: int) -> RecordBatchWithMetadata: ... + + def read_all(self) -> Table: ... + + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_val, exc_tb): ... + @property + def schema(self) -> Schema: ... + @property + def stats(self) -> ReadStats: ... + @property + def metadata(self) -> KeyValueMetadata | None: ... + + def _open( + self, + source, + footer_offset: int | None = None, + options: IpcReadOptions | None = None, + memory_pool: MemoryPool | None = None, + ) -> Self: ... + + +def get_tensor_size(tensor: Tensor) -> int: ... + + +def get_record_batch_size(batch: RecordBatch) -> int: ... + + +def write_tensor(tensor: Tensor, dest: NativeFile) -> int: ... + + +def read_tensor(source: NativeFile) -> Tensor: ... + + +def read_message(source: NativeFile | IOBase | SupportPyBuffer) -> Message: ... + + +def read_schema(obj: Buffer | Message | BufferReader, dictionary_memo: DictionaryMemo | + None = None) -> Schema: ... + + +def read_record_batch( + obj: Message | SupportPyBuffer, + schema: Schema, + dictionary_memo: DictionaryMemo | None = None) -> RecordBatch: ... + + +__all__ = [ + "MetadataVersion", + "Alignment", + "WriteStats", + "ReadStats", + "IpcReadOptions", + "IpcWriteOptions", + "Message", + "MessageReader", + "_CRecordBatchWriter", + "_RecordBatchStreamWriter", + "_ReadPandasMixin", + "RecordBatchReader", + "_RecordBatchStreamReader", + "_RecordBatchFileWriter", + "RecordBatchWithMetadata", + "_RecordBatchFileReader", + "get_tensor_size", + "get_record_batch_size", + "write_tensor", + "read_tensor", + "read_message", + "read_schema", + "read_record_batch", +] diff --git a/python/pyarrow-stubs/pyarrow/_json.pyi b/python/pyarrow-stubs/pyarrow/_json.pyi new file mode 100644 index 000000000000..bae2ff404f09 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_json.pyi @@ -0,0 +1,66 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import IO, Any, Literal + +from _typeshed import StrPath + +from .lib import MemoryPool, RecordBatchReader, Schema, Table, _Weakrefable + + +class ReadOptions(_Weakrefable): + use_threads: bool + block_size: int + + def __init__(self, use_threads: bool | None = None, + block_size: int | None = None): ... + + def equals(self, other: ReadOptions) -> bool: ... + + +class ParseOptions(_Weakrefable): + explicit_schema: Schema + newlines_in_values: bool + unexpected_field_behavior: Literal["ignore", "error", "infer"] + + def __init__( + self, + explicit_schema: Schema | None = None, + newlines_in_values: bool | None = None, + unexpected_field_behavior: Literal["ignore", "error", "infer"] = "infer", + ): ... + def equals(self, other: ParseOptions) -> bool: ... + + +class JSONStreamingReader(RecordBatchReader): + ... + + +def read_json( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + memory_pool: MemoryPool | None = None, +) -> Table: ... + + +def open_json( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + memory_pool: MemoryPool | None = None, +) -> JSONStreamingReader: ... diff --git a/python/pyarrow-stubs/pyarrow/_orc.pyi b/python/pyarrow-stubs/pyarrow/_orc.pyi new file mode 100644 index 000000000000..faa0f57c1fdc --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_orc.pyi @@ -0,0 +1,77 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import IO, Literal, Any + +from .lib import ( + Buffer, + KeyValueMetadata, + MemoryPool, + NativeFile, + RecordBatch, + Schema, + Table, + _Weakrefable, +) + + +class ORCReader(_Weakrefable): + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def open(self, source: str | NativeFile | Buffer, use_memory_map: bool = True): ... + def metadata(self) -> KeyValueMetadata: ... + def schema(self) -> Schema: ... + def nrows(self) -> int: ... + def nstripes(self) -> int: ... + def file_version(self) -> str: ... + def software_version(self) -> str: ... + def compression(self) -> Literal["UNCOMPRESSED", + "ZLIB", "SNAPPY", "LZ4", "ZSTD"]: ... + + def compression_size(self) -> int: ... + def row_index_stride(self) -> int: ... + def writer(self) -> str: ... + def writer_version(self) -> str: ... + def nstripe_statistics(self) -> int: ... + def content_length(self) -> int: ... + def stripe_statistics_length(self) -> int: ... + def file_footer_length(self) -> int: ... + def file_postscript_length(self) -> int: ... + def file_length(self) -> int: ... + def serialized_file_tail(self) -> int: ... + def read_stripe(self, n: int, columns: list[str] | None = None) -> RecordBatch: ... + def read(self, columns: list[str] | None = None) -> Table: ... + + +class ORCWriter(_Weakrefable): + def open( + self, + where: str | NativeFile | IO, + *, + file_version: str | None = None, + batch_size: int | None = None, + stripe_size: int | None = None, + compression: Any = 'UNCOMPRESSED', + compression_block_size: int | None = None, + compression_strategy: Any = 'SPEED', + row_index_stride: int | None = None, + padding_tolerance: float | None = None, + dictionary_key_size_threshold: float | None = None, + bloom_filter_columns: list[int] | None = None, + bloom_filter_fpp: float | None = None, + ) -> None: ... + def write(self, table: Table) -> None: ... + def close(self) -> None: ... diff --git a/python/pyarrow-stubs/pyarrow/csv.pyi b/python/pyarrow-stubs/pyarrow/csv.pyi new file mode 100644 index 000000000000..a7abd413aab4 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/csv.pyi @@ -0,0 +1,44 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from pyarrow._csv import ( + ISO8601, + ConvertOptions, + CSVStreamingReader, + CSVWriter, + InvalidRow, + ParseOptions, + ReadOptions, + WriteOptions, + open_csv, + read_csv, + write_csv, +) + +__all__ = [ + "ISO8601", + "ConvertOptions", + "CSVStreamingReader", + "CSVWriter", + "InvalidRow", + "ParseOptions", + "ReadOptions", + "WriteOptions", + "open_csv", + "read_csv", + "write_csv", +] diff --git a/python/pyarrow-stubs/pyarrow/feather.pyi b/python/pyarrow-stubs/pyarrow/feather.pyi new file mode 100644 index 000000000000..cf9d34020913 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/feather.pyi @@ -0,0 +1,81 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from collections.abc import Iterable +from typing import IO, Literal + +import pandas as pd + +from pyarrow import lib +from pyarrow.lib import Table +from pyarrow._typing import StrPath +from ._feather import FeatherError + + +class FeatherDataset: + path_or_paths: str | list[str] + validate_schema: bool + + def __init__(self, path_or_paths: str | + list[str], validate_schema: bool = True) -> None: ... + + def read_table(self, columns: list[str] | None = None) -> Table: ... + def validate_schemas(self, piece, table: Table) -> None: ... + + def read_pandas( + self, columns: list[str] | None = None, use_threads: bool = True + ) -> pd.DataFrame: ... + + +def check_chunked_overflow(name: str, col) -> None: ... + + +def write_feather( + df: pd.DataFrame | Table | lib.ChunkedArray, + dest: StrPath | IO, + compression: Literal["zstd", "lz4", "uncompressed", "snappy"] | None = None, + compression_level: int | None = None, + chunksize: int | None = None, + version: Literal[1, 2] = 2, +) -> None: ... + + +def read_feather( + source: StrPath | IO | lib.NativeFile, + columns: list[str] | None = None, + use_threads: bool = True, + memory_map: bool = False, + **kwargs, +) -> pd.DataFrame: ... + + +def read_table( + source: StrPath | IO | lib.NativeFile, + columns: list[str | int] | Iterable[str | int] | None = None, + memory_map: bool = False, + use_threads: bool = True, +) -> Table: ... + + +__all__ = [ + "FeatherError", + "FeatherDataset", + "check_chunked_overflow", + "write_feather", + "read_feather", + "read_table", +] diff --git a/python/pyarrow-stubs/pyarrow/ipc.pyi b/python/pyarrow-stubs/pyarrow/ipc.pyi new file mode 100644 index 000000000000..d153ab0f46aa --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/ipc.pyi @@ -0,0 +1,162 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from io import IOBase +from typing import Any + +from _typeshed import StrPath +import pandas as pd +import pyarrow.lib as lib + +from pyarrow.lib import ( + Alignment, + IpcReadOptions, + IpcWriteOptions, + Message, + MessageReader, + MetadataVersion, + ReadStats, + RecordBatchReader, + WriteStats, + _ReadPandasMixin, + get_record_batch_size, + get_tensor_size, + read_message, + read_record_batch, + read_schema, + read_tensor, + write_tensor, +) + + +class RecordBatchStreamReader(lib._RecordBatchStreamReader): + def __init__( + self, + source: bytes | lib.Buffer | lib.NativeFile | IOBase, + *, + options: IpcReadOptions | None = None, + memory_pool: lib.MemoryPool | None = None, + ) -> None: ... + + +class RecordBatchStreamWriter(lib._RecordBatchStreamWriter): + def __init__( + self, + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, + *, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, + ) -> None: ... + + +class RecordBatchFileReader(lib._RecordBatchFileReader): + def __init__( + self, + source: bytes | lib.Buffer | lib.NativeFile | IOBase, + footer_offset: int | None = None, + *, + options: IpcReadOptions | None = None, + memory_pool: lib.MemoryPool | None = None, + ) -> None: ... + + +class RecordBatchFileWriter(lib._RecordBatchFileWriter): + def __init__( + self, + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, + *, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, + ) -> None: ... + + +def new_stream( + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, + *, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, +) -> RecordBatchStreamWriter: ... + + +def open_stream( + source: bytes | int | lib.Buffer | lib.NativeFile | IOBase, + *, + options: Any = None, + memory_pool: lib.MemoryPool | None = None, +) -> RecordBatchStreamReader: ... + + +def new_file( + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, + *, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, + metadata: lib.KeyValueMetadata | dict[bytes, bytes] | None = None, +) -> RecordBatchFileWriter: ... + + +def open_file( + source: StrPath | bytes | lib.Buffer | lib.NativeFile | IOBase, + footer_offset: int | None = None, + *, + options: Any = None, + memory_pool: lib.MemoryPool | None = None, +) -> RecordBatchFileReader: ... + + +def serialize_pandas( + df: pd.DataFrame, *, nthreads: int | None = None, preserve_index: bool | None = None +) -> lib.Buffer: ... + + +def deserialize_pandas( + buf: lib.Buffer, *, use_threads: bool = True) -> pd.DataFrame: ... + + +__all__ = [ + "Alignment", + "IpcReadOptions", + "IpcWriteOptions", + "Message", + "MessageReader", + "MetadataVersion", + "ReadStats", + "RecordBatchReader", + "WriteStats", + "_ReadPandasMixin", + "get_record_batch_size", + "get_tensor_size", + "read_message", + "read_record_batch", + "read_schema", + "read_tensor", + "write_tensor", + "RecordBatchStreamReader", + "RecordBatchStreamWriter", + "RecordBatchFileReader", + "RecordBatchFileWriter", + "new_stream", + "open_stream", + "new_file", + "open_file", + "serialize_pandas", + "deserialize_pandas", +] diff --git a/python/pyarrow-stubs/pyarrow/json.pyi b/python/pyarrow-stubs/pyarrow/json.pyi new file mode 100644 index 000000000000..67768db42e43 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/json.pyi @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from pyarrow._json import ParseOptions, ReadOptions, open_json, read_json + +__all__ = ["ParseOptions", "ReadOptions", "read_json", "open_json"] diff --git a/python/pyarrow-stubs/pyarrow/orc.pyi b/python/pyarrow-stubs/pyarrow/orc.pyi new file mode 100644 index 000000000000..f16350d0ffc9 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/orc.pyi @@ -0,0 +1,146 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import IO, Any, Literal + +from _typeshed import StrPath + +from . import _orc +from ._fs import SupportedFileSystem +from .lib import KeyValueMetadata, NativeFile, RecordBatch, Schema, Table + + +class ORCFile: + reader: _orc.ORCReader + def __init__(self, source: StrPath | NativeFile | IO) -> None: ... + @property + def metadata(self) -> KeyValueMetadata: ... + + @property + def schema(self) -> Schema: ... + + @property + def nrows(self) -> int: ... + + @property + def nstripes(self) -> int: ... + + @property + def file_version(self) -> str: ... + + @property + def software_version(self) -> str: ... + + @property + def compression(self) -> Literal["UNCOMPRESSED", + "ZLIB", "SNAPPY", "LZ4", "ZSTD"]: ... + + @property + def compression_size(self) -> int: ... + + @property + def writer(self) -> str: ... + + @property + def writer_version(self) -> str: ... + + @property + def row_index_stride(self) -> int: ... + + @property + def nstripe_statistics(self) -> int: ... + + @property + def content_length(self) -> int: ... + + @property + def stripe_statistics_length(self) -> int: ... + + @property + def file_footer_length(self) -> int: ... + + @property + def file_postscript_length(self) -> int: ... + + @property + def file_length(self) -> int: ... + + def read_stripe( + self, n: int, columns: list[str | int] | None = None + ) -> RecordBatch: ... + + def read(self, columns: list[str | int] | None = None) -> Table: ... + + +class ORCWriter: + writer: _orc.ORCWriter + is_open: bool + + def __init__( + self, + where: StrPath | NativeFile | IO, + *, + file_version: Any = "0.12", + batch_size: Any = 1024, + stripe_size: Any = 64 * 1024 * 1024, # noqa: Y011 + compression: Any = "UNCOMPRESSED", + compression_block_size: Any = 65536, + compression_strategy: Any = "SPEED", + row_index_stride: Any = 10000, + padding_tolerance: Any = 0.0, + dictionary_key_size_threshold: Any = 0.0, + bloom_filter_columns: Any = None, + bloom_filter_fpp: Any = 0.05, + ): ... + def __enter__(self) -> Self: ... + def __exit__(self, *args, **kwargs) -> None: ... + def __getattr__(self, name: str) -> Any: ... + def write(self, table: Table) -> None: ... + + def close(self) -> None: ... + + +def read_table( + source: StrPath | NativeFile | IO, + columns: list[str | int] | None = None, + filesystem: SupportedFileSystem | str | None = None, +) -> Table: ... + + +# TODO: should not use Any here? +def write_table( + table: Table, + where: StrPath | NativeFile | IO, + *, + file_version: Any = "0.12", + batch_size: Any = 1024, + stripe_size: Any = 64 * 1024 * 1024, # noqa: Y011 + compression: Any = 'UNCOMPRESSED', + compression_block_size: Any = 65536, + compression_strategy: Any = 'SPEED', + row_index_stride: Any = 10000, + padding_tolerance: Any = 0.0, + dictionary_key_size_threshold: Any = 0.0, + bloom_filter_columns: Any = None, + bloom_filter_fpp: Any = 0.05, +) -> None: ... diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py index 241c27706a6f..4b0ecb9f18e0 100644 --- a/python/pyarrow/feather.py +++ b/python/pyarrow/feather.py @@ -183,6 +183,7 @@ def write_feather(df, dest, compression=None, compression_level=None, f'one of {_FEATHER_SUPPORTED_CODECS}') try: + assert version in (1, 2) _feather.write_feather(table, dest, compression=compression, compression_level=compression_level, chunksize=chunksize, version=version) @@ -269,7 +270,7 @@ def read_table(source, columns=None, memory_map=False, use_threads=True): f"Got columns {columns} of types {column_type_names}") # Feather v1 already respects the column selection - if reader.version < 3: + if int(reader.version) < 3: return table # Feather v2 reads with sorted / deduplicated selection elif sorted(set(columns)) == columns: diff --git a/python/pyarrow/orc.py b/python/pyarrow/orc.py index 4e0d66ec6659..222c289c8793 100644 --- a/python/pyarrow/orc.py +++ b/python/pyarrow/orc.py @@ -20,7 +20,7 @@ import warnings from pyarrow.lib import Table -import pyarrow._orc as _orc +import pyarrow._orc as _orc # type: ignore[reportMissingModuleSource] from pyarrow.fs import _resolve_filesystem_and_path @@ -255,9 +255,11 @@ def __init__(self, where, *, file_version=file_version, batch_size=batch_size, stripe_size=stripe_size, - compression=compression, + compression=compression, # type: ignore[reportArgumentType] compression_block_size=compression_block_size, - compression_strategy=compression_strategy, + compression_strategy=( + compression_strategy # type: ignore[reportArgumentType] + ), row_index_stride=row_index_stride, padding_tolerance=padding_tolerance, dictionary_key_size_threshold=dictionary_key_size_threshold, diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py index dce605c7156d..2c271fa9b1bf 100644 --- a/python/pyarrow/tests/test_csv.py +++ b/python/pyarrow/tests/test_csv.py @@ -178,6 +178,7 @@ def test_read_options(pickle_module): encoding='utf16', skip_rows_after_names=27) + assert opts.block_size is not None assert opts.block_size > 0 opts.block_size = 12345 assert opts.block_size == 12345 @@ -302,6 +303,7 @@ def test_convert_options(pickle_module): with pytest.raises(ValueError): opts.decimal_point = '..' + assert opts.auto_dict_max_cardinality is not None assert opts.auto_dict_max_cardinality > 0 opts.auto_dict_max_cardinality = 99999 assert opts.auto_dict_max_cardinality == 99999 @@ -323,7 +325,7 @@ def test_convert_options(pickle_module): with pytest.raises(TypeError, match='DataType expected'): opts.column_types = {'a': None} with pytest.raises(TypeError): - opts.column_types = 0 + opts.column_types = 0 # type: ignore[reportAttributeAccessIssue] assert isinstance(opts.null_values, list) assert '' in opts.null_values @@ -1158,10 +1160,14 @@ def test_auto_dict_encode(self): table = self.read_bytes(rows, convert_options=opts, validate_full=False) assert table.schema == schema - dict_values = table['a'].chunk(0).dictionary + column_chunk = table.column('a').chunk(0) + assert isinstance(column_chunk, pa.DictionaryArray) + dict_values = column_chunk.dictionary assert len(dict_values) == 2 assert dict_values[0].as_py() == "ab" - assert dict_values[1].as_buffer() == b"cd\xff" + dict_value = dict_values[1] + assert isinstance(dict_value, pa.StringScalar) + assert dict_value.as_buffer() == b"cd\xff" # With invalid UTF8, checked opts.check_utf8 = True @@ -1502,7 +1508,7 @@ def signal_from_thread(): # Interruption should have arrived timely assert last_duration <= 2.0 - e = exc_info.__context__ + e = exc_info.__context__ # type: ignore[possibly-missing-attribute, misc] assert isinstance(e, pa.ArrowCancelled) assert e.signum == signal.SIGINT @@ -1866,6 +1872,9 @@ def use_threads(self): class BaseTestCompressedCSVRead: + def write_file(self, path, contents): + pass + csv_filename = "" def setUp(self): self.tmpdir = tempfile.mkdtemp(prefix='arrow-csv-test-') @@ -1997,7 +2006,7 @@ def test_write_quoting_style(): except Exception as e: # This will trigger when we try to write a comma (,) # without quotes, which is invalid - assert isinstance(e, res) + assert isinstance(e, res) # type: ignore[invalid-argument-type] break assert buf.getvalue() == res buf.seek(0) diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py index 054bf920b269..a84b343b3dd2 100644 --- a/python/pyarrow/tests/test_feather.py +++ b/python/pyarrow/tests/test_feather.py @@ -26,7 +26,7 @@ try: import numpy as np except ImportError: - np = None + pass import pyarrow as pa import pyarrow.tests.strategies as past @@ -47,7 +47,7 @@ def datadir(base_datadir): def random_path(prefix='feather_'): - return tempfile.mktemp(prefix=prefix) + return tempfile.mktemp(prefix=prefix) # type: ignore[deprecated] @pytest.fixture(scope="module", params=[1, 2]) @@ -63,7 +63,7 @@ def compression(request): yield request.param -TEST_FILES = None +TEST_FILES: list[str] | None = None def setup_module(module): @@ -72,7 +72,7 @@ def setup_module(module): def teardown_module(module): - for path in TEST_FILES: + for path in TEST_FILES: # type: ignore[union-attr] try: os.remove(path) except os.error: @@ -95,6 +95,7 @@ def _check_pandas_roundtrip(df, expected=None, path=None, if version is None: version = 2 + assert TEST_FILES is not None TEST_FILES.append(path) write_feather(df, path, compression=compression, compression_level=compression_level, version=version) @@ -114,6 +115,7 @@ def _check_arrow_roundtrip(table, path=None, compression=None): if path is None: path = random_path() + assert TEST_FILES is not None TEST_FILES.append(path) write_feather(table, path, compression=compression) if not os.path.exists(path): @@ -126,10 +128,12 @@ def _check_arrow_roundtrip(table, path=None, compression=None): def _assert_error_on_write(df, exc, path=None, version=2): # check that we are raising the exception # on writing + assert version in (1, 2) if path is None: path = random_path() + assert TEST_FILES is not None TEST_FILES.append(path) def f(): @@ -149,6 +153,7 @@ def test_dataset(version): } table = pa.table(data) + assert TEST_FILES is not None TEST_FILES.extend(paths) for index, path in enumerate(paths): rows = ( @@ -156,7 +161,8 @@ def test_dataset(version): (index + 1) * (num_values[0] // num_files), ) - write_feather(table[rows[0]: rows[1]], path, version=version) + write_feather(table[rows[0]: rows[1]], path, + version=version) # type: ignore[arg-type] data = FeatherDataset(paths).read_table() assert data.equals(table) @@ -181,6 +187,7 @@ def test_read_table(version): num_values = (100, 100) path = random_path() + assert TEST_FILES is not None TEST_FILES.append(path) values = np.random.randint(0, 100, size=num_values) @@ -206,6 +213,7 @@ def test_use_threads(version): num_values = (10, 10) path = random_path() + assert TEST_FILES is not None TEST_FILES.append(path) values = np.random.randint(0, 10, size=num_values) @@ -231,6 +239,7 @@ def test_float_nulls(version): num_values = 100 path = random_path() + assert TEST_FILES is not None TEST_FILES.append(path) null_mask = np.random.randint(0, 10, size=num_values) < 3 @@ -292,6 +301,7 @@ def test_platform_numpy_integers(version): def test_integer_with_nulls(version): # pandas requires upcast to float dtype path = random_path() + assert TEST_FILES is not None TEST_FILES.append(path) int_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'] @@ -330,6 +340,7 @@ def test_boolean_no_nulls(version): def test_boolean_nulls(version): # pandas requires upcast to object dtype path = random_path() + assert TEST_FILES is not None TEST_FILES.append(path) num_values = 100 @@ -348,6 +359,7 @@ def test_boolean_nulls(version): def test_buffer_bounds_error(version): # ARROW-1676 path = random_path() + assert TEST_FILES is not None TEST_FILES.append(path) for i in range(16, 256): @@ -360,6 +372,7 @@ def test_buffer_bounds_error(version): @pytest.mark.numpy def test_boolean_object_nulls(version): + assert np is not None repeats = 100 table = pa.Table.from_arrays( [np.array([False, None, True] * repeats, dtype=object)], @@ -426,7 +439,8 @@ def test_empty_strings(version): @pytest.mark.pandas def test_all_none(version): df = pd.DataFrame({'all_none': [None] * 10}) - if version == 1 and pa.pandas_compat._pandas_api.uses_string_dtype(): + if (version == 1 and pa.pandas_compat # type: ignore[attr-defined] + ._pandas_api.uses_string_dtype()): expected = df.astype("str") else: expected = df @@ -552,6 +566,7 @@ def test_read_columns(version): @pytest.mark.numpy def test_overwritten_file(version): path = random_path() + assert TEST_FILES is not None TEST_FILES.append(path) num_values = 100 @@ -585,12 +600,12 @@ def test_filelike_objects(version): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") def test_sparse_dataframe(version): - if not pa.pandas_compat._pandas_api.has_sparse: + if not pa.pandas_compat._pandas_api.has_sparse: # type: ignore[attr-defined] pytest.skip("version of pandas does not support SparseDataFrame") # GH #221 data = {'A': [0, 1, 2], 'B': [1, 0, 1]} - df = pd.DataFrame(data).to_sparse(fill_value=1) + df = pd.DataFrame(data).to_sparse(fill_value=1) # type: ignore[attr-defined] expected = df.to_dense() _check_pandas_roundtrip(df, expected, version=version) @@ -692,8 +707,9 @@ def test_v2_lz4_default_compression(): if not pa.Codec.is_available('lz4_frame'): pytest.skip("LZ4 compression support is not built in C++") + assert np is not None # some highly compressible data - t = pa.table([np.repeat(0, 100000)], names=['f0']) + t = pa.table([np.repeat(0, 100000)], names=['f0']) # type: ignore[arg-type] buf = io.BytesIO() write_feather(t, buf) diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py index a6d3546e57c6..3837b553b8be 100644 --- a/python/pyarrow/tests/test_io.py +++ b/python/pyarrow/tests/test_io.py @@ -24,16 +24,17 @@ import math import os import pathlib -import pytest +import pytest # type: ignore[import-not-found] import random import sys import tempfile +from typing import cast import weakref try: import numpy as np except ImportError: - np = None + pass from pyarrow.util import guid from pyarrow import Codec @@ -44,7 +45,7 @@ def check_large_seeks(file_factory, expected_error=None): if sys.platform in ('win32', 'darwin', 'emscripten'): pytest.skip("need sparse file support") try: - filename = tempfile.mktemp(prefix='test_io') + filename = tempfile.mkstemp(prefix='test_io')[1] with open(filename, 'wb') as f: f.truncate(2 ** 32 + 10) f.seek(2 ** 32 + 5) @@ -234,7 +235,7 @@ def read_buffer(self, nbytes): return memoryview(dst_buf)[:nbytes] duck_reader = DuckReader() - with pa.PythonFile(duck_reader, mode='r') as f: + with pa.PythonFile(duck_reader, mode='r') as f: # type: ignore[arg-type] buf = f.read_buffer(length) assert len(buf) == length assert memoryview(buf).tobytes() == dst_buf[:length] @@ -474,7 +475,7 @@ def test_buffer_to_numpy(): byte_array = bytearray(20) byte_array[0] = 42 buf = pa.py_buffer(byte_array) - array = np.frombuffer(buf, dtype="uint8") + array = np.frombuffer(buf, dtype="uint8") # type: ignore[arg-type] assert array[0] == byte_array[0] byte_array[0] += 1 assert array[0] == byte_array[0] @@ -557,7 +558,7 @@ def test_buffer_eq_bytes(): assert buf != b'some dat1' with pytest.raises(TypeError): - buf == 'some data' + _ = buf == 'some data' def test_buffer_getitem(): @@ -598,22 +599,22 @@ def test_buffer_slicing(): with pytest.raises(IndexError): buf.slice(len(buf) + 1) - assert buf[11:].to_pybytes() == b"" + assert cast(pa.Buffer, buf[11:]).to_pybytes() == b"" # Slice stop exceeds buffer length with pytest.raises(IndexError): buf.slice(1, len(buf)) - assert buf[1:11].to_pybytes() == buf.to_pybytes()[1:] + assert cast(pa.Buffer, buf[1:11]).to_pybytes() == buf.to_pybytes()[1:] # Negative length with pytest.raises(IndexError): buf.slice(1, -1) # Test slice notation - assert buf[2:].equals(buf.slice(2)) - assert buf[2:5].equals(buf.slice(2, 3)) - assert buf[-5:].equals(buf.slice(len(buf) - 5)) - assert buf[-5:-2].equals(buf.slice(len(buf) - 5, 3)) + assert cast(pa.Buffer, buf[2:]).equals(buf.slice(2)) + assert cast(pa.Buffer, buf[2:5]).equals(buf.slice(2, 3)) + assert cast(pa.Buffer, buf[-5:]).equals(buf.slice(len(buf) - 5)) + assert cast(pa.Buffer, buf[-5:-2]).equals(buf.slice(len(buf) - 5, 3)) with pytest.raises(IndexError): buf[::-1] @@ -623,7 +624,8 @@ def test_buffer_slicing(): n = len(buf) for start in range(-n * 2, n * 2): for stop in range(-n * 2, n * 2): - assert buf[start:stop].to_pybytes() == buf.to_pybytes()[start:stop] + assert cast(pa.Buffer, buf[start:stop]).to_pybytes( + ) == buf.to_pybytes()[start:stop] def test_buffer_hashing(): @@ -640,7 +642,7 @@ def test_buffer_protocol_respects_immutability(): # immutable a = b'12345' arrow_ref = pa.py_buffer(a) - numpy_ref = np.frombuffer(arrow_ref, dtype=np.uint8) + numpy_ref = np.frombuffer(arrow_ref, dtype=np.uint8) # type: ignore[arg-type] assert not numpy_ref.flags.writeable @@ -652,7 +654,8 @@ def test_foreign_buffer(): buf = pa.foreign_buffer(addr, size, obj) wr = weakref.ref(obj) del obj - assert np.frombuffer(buf, dtype=np.int32).tolist() == [1, 2] + assert (np.frombuffer(buf, dtype=np.int32).tolist() # type: ignore[arg-type] + == [1, 2]) assert wr() is not None del buf assert wr() is None @@ -688,6 +691,7 @@ def test_non_cpu_buffer(pickle_module): cuda_buf = ctx.buffer_from_data(data) arr = pa.FixedSizeBinaryArray.from_buffers(pa.binary(7), 1, [None, cuda_buf]) buf_on_gpu = arr.buffers()[1] + assert buf_on_gpu is not None assert buf_on_gpu.size == cuda_buf.size assert buf_on_gpu.address == cuda_buf.address @@ -708,7 +712,7 @@ def test_non_cpu_buffer(pickle_module): assert cuda_sliced.to_pybytes() == b'st' # Sliced buffers with same address - assert buf_on_gpu_sliced.equals(cuda_buf[2:4]) + assert cast(pa.Buffer, buf_on_gpu_sliced).equals(cuda_buf[2:4]) # Buffers on different devices msg_device = "Device on which the data resides differs between buffers" @@ -720,13 +724,14 @@ def test_non_cpu_buffer(pickle_module): arr_short = np.array([b'sting']) cuda_buf_short = ctx.buffer_from_data(arr_short) with pytest.raises(NotImplementedError, match=msg): - buf_on_gpu_sliced.equals(cuda_buf_short) + cast(pa.Buffer, buf_on_gpu_sliced).equals(cuda_buf_short) arr_short = pa.FixedSizeBinaryArray.from_buffers( pa.binary(5), 1, [None, cuda_buf_short] ) buf_on_gpu_short = arr_short.buffers()[1] + assert buf_on_gpu_short is not None with pytest.raises(NotImplementedError, match=msg): - buf_on_gpu_sliced.equals(buf_on_gpu_short) + cast(pa.Buffer, buf_on_gpu_sliced).equals(buf_on_gpu_short) with pytest.raises(NotImplementedError, match=msg): buf_on_gpu.hex() @@ -811,8 +816,9 @@ def test_cache_options_pickling(pickle_module): @pytest.mark.numpy @pytest.mark.parametrize("compression", [ - pytest.param( - "bz2", marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError) + pytest.param("bz2", marks=pytest.mark.xfail( + raises=pa.lib.ArrowNotImplementedError # type: ignore[attr-defined] + ) ), "brotli", "gzip", @@ -843,6 +849,7 @@ def test_compress_decompress(compression): assert isinstance(decompressed_bytes, bytes) + assert isinstance(decompressed_buf, pa.Buffer) assert decompressed_buf.equals(test_buf) assert decompressed_bytes == test_data @@ -852,8 +859,9 @@ def test_compress_decompress(compression): @pytest.mark.numpy @pytest.mark.parametrize("compression", [ - pytest.param( - "bz2", marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError) + pytest.param("bz2", marks=pytest.mark.xfail( + raises=pa.lib.ArrowNotImplementedError # type: ignore[attr-defined] + ) ), "brotli", "gzip", @@ -910,6 +918,7 @@ def test_compression_level(compression): assert isinstance(decompressed_bytes, bytes) + assert isinstance(decompressed_buf, pa.Buffer) assert decompressed_buf.equals(test_buf) assert decompressed_bytes == test_data @@ -951,12 +960,12 @@ def test_buffer_memoryview_is_immutable(): assert result.readonly with pytest.raises(TypeError) as exc: - result[0] = b'h' + result[0] = b'h' # type: ignore[index] assert 'cannot modify read-only' in str(exc.value) b = bytes(buf) with pytest.raises(TypeError) as exc: - b[0] = b'h' + b[0] = b'h' # type: ignore[index] assert 'cannot modify read-only' in str(exc.value) @@ -1748,9 +1757,9 @@ def test_unknown_compression_raises(): "gzip", "lz4", "zstd", - pytest.param( - "snappy", - marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError) + pytest.param("snappy", marks=pytest.mark.xfail( + raises=pa.lib.ArrowNotImplementedError # type: ignore[attr-defined] + ) ) ]) def test_compressed_roundtrip(compression): @@ -2021,7 +2030,7 @@ def test_input_stream_native_file(): def test_input_stream_errors(tmpdir): buf = memoryview(b"") with pytest.raises(ValueError): - pa.input_stream(buf, compression="foo") + pa.input_stream(buf, compression="foo") # type: ignore[reportArgumentType] for arg in [bytearray(), StringIO()]: with pytest.raises(TypeError): @@ -2198,7 +2207,7 @@ def check_data(data, **kwargs): def test_output_stream_errors(tmpdir): buf = memoryview(bytearray()) with pytest.raises(ValueError): - pa.output_stream(buf, compression="foo") + pa.output_stream(buf, compression="foo") # type: ignore[reportArgumentType] for arg in [bytearray(), StringIO()]: with pytest.raises(TypeError): diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py index 6813ed777234..93b9e7f1aa02 100644 --- a/python/pyarrow/tests/test_ipc.py +++ b/python/pyarrow/tests/test_ipc.py @@ -24,23 +24,27 @@ import socket import threading import weakref +from typing import TYPE_CHECKING, cast -try: +if TYPE_CHECKING: import numpy as np -except ImportError: - np = None + import pandas as pd + from pandas.testing import assert_frame_equal +else: + try: + import numpy as np + except ImportError: + pass + try: + from pandas.testing import assert_frame_equal + import pandas as pd + except ImportError: + pass import pyarrow as pa from pyarrow.tests.util import changed_environ, invoke_script -try: - from pandas.testing import assert_frame_equal - import pandas as pd -except ImportError: - pass - - class IpcFixture: write_stats = None @@ -48,6 +52,9 @@ def __init__(self, sink_factory=lambda: io.BytesIO()): self._sink_factory = sink_factory self.sink = self.get_sink() + def _get_writer(self, sink, schema): + ... # Implemented in subclasses + def get_sink(self): return self._sink_factory() @@ -59,6 +66,7 @@ def write_batches(self, num_batches=5, as_table=False): schema = pa.schema([('one', pa.float64()), ('two', pa.utf8())]) writer = self._get_writer(self.sink, schema) + assert writer is not None batches = [] for i in range(num_batches): @@ -385,7 +393,8 @@ def test_stream_write_table_batches(stream_fixture): 'one': np.random.randn(20), }) - b1 = pa.RecordBatch.from_pandas(df[:10], preserve_index=False) + b1 = pa.RecordBatch.from_pandas( + df[:10], preserve_index=False) # type: ignore[arg-type] b2 = pa.RecordBatch.from_pandas(df, preserve_index=False) table = pa.Table.from_batches([b1, b2, b1]) @@ -941,7 +950,7 @@ def test_ipc_file_stream_has_eos(): buffer = sink.getvalue() # skip the file magic - reader = pa.ipc.open_stream(buffer[8:]) + reader = pa.ipc.open_stream(cast(pa.Buffer, buffer[8:])) # will fail if encounters footer data instead of eos rdf = reader.read_pandas() @@ -980,7 +989,8 @@ def test_batches_with_custom_metadata_roundtrip(ipc_type): with file_factory(sink, batch.schema) as writer: for i in range(batch_count): - writer.write_batch(batch, custom_metadata={"batch_id": str(i)}) + writer.write_batch(batch, custom_metadata={ # type: ignore[arg-type] + "batch_id": str(i)}) # write a batch without custom metadata writer.write_batch(batch) diff --git a/python/pyarrow/tests/test_json.py b/python/pyarrow/tests/test_json.py index c3f9fe333bd0..c0b6b8ecd0d5 100644 --- a/python/pyarrow/tests/test_json.py +++ b/python/pyarrow/tests/test_json.py @@ -23,11 +23,16 @@ import json import string import unittest +from typing import TYPE_CHECKING -try: +if TYPE_CHECKING: import numpy as np -except ImportError: - np = None +else: + try: + import numpy as np + except ImportError: + pass + import pytest import pyarrow as pa @@ -317,6 +322,9 @@ def test_stress_block_sizes(self): class BaseTestJSONRead(BaseTestJSON): + def read_json(self, *args, **kwargs) -> pa.Table: # type: ignore[empty-body] + ... # Implemented in subclasses + def read_bytes(self, b, **kwargs): return self.read_json(pa.py_buffer(b), **kwargs) @@ -352,6 +360,8 @@ def test_reconcile_across_blocks(self): class BaseTestStreamingJSONRead(BaseTestJSON): + use_threads: bool = False # Set by subclasses + def open_json(self, json, *args, **kwargs): """ Reads the JSON file into memory using pyarrow's open_json diff --git a/python/pyarrow/tests/test_orc.py b/python/pyarrow/tests/test_orc.py index 27154a6f34f3..d0e61d758cb2 100644 --- a/python/pyarrow/tests/test_orc.py +++ b/python/pyarrow/tests/test_orc.py @@ -77,7 +77,7 @@ def fix_example_values(actual_cols, expected_cols): if not pd.isnull(v): exp = d.as_tuple().exponent factor = 10 ** -exp - converted_decimals[i] = ( + converted_decimals[i] = ( # type: ignore[call-overload,assignment] decimal.Decimal(round(v * factor)).scaleb(exp)) expected = pd.Series(converted_decimals) @@ -314,7 +314,7 @@ def test_buffer_readwrite(): # deprecated keyword order buffer_output_stream = pa.BufferOutputStream() with pytest.warns(FutureWarning): - orc.write_table(buffer_output_stream, table) + orc.write_table(buffer_output_stream, table) # type: ignore[arg-type] buffer_reader = pa.BufferReader(buffer_output_stream.getvalue()) orc_file = orc.ORCFile(buffer_reader) output_table = orc_file.read() @@ -356,8 +356,8 @@ def test_buffer_readwrite_with_writeoptions(): buffer_output_stream = pa.BufferOutputStream() with pytest.warns(FutureWarning): orc.write_table( - buffer_output_stream, - table, + buffer_output_stream, # type: ignore[reportArgumentType] + table, # type: ignore[reportArgumentType] compression='uncompressed', file_version='0.11', row_index_stride=20000, @@ -444,20 +444,20 @@ def test_buffer_readwrite_with_bad_writeoptions(): orc.write_table( table, buffer_output_stream, - compression=0, + compression=0, # type: ignore[reportArgumentType] ) with pytest.raises(ValueError): orc.write_table( table, buffer_output_stream, - compression='none', + compression='none', # type: ignore[reportArgumentType] ) with pytest.raises(ValueError): orc.write_table( table, buffer_output_stream, - compression='zlid', + compression='zlid', # type: ignore[reportArgumentType] ) # compression_block_size must be a positive integer @@ -487,20 +487,20 @@ def test_buffer_readwrite_with_bad_writeoptions(): orc.write_table( table, buffer_output_stream, - compression_strategy=0, + compression_strategy=0, # type: ignore[reportArgumentType] ) with pytest.raises(ValueError): orc.write_table( table, buffer_output_stream, - compression_strategy='no', + compression_strategy='no', # type: ignore[reportArgumentType] ) with pytest.raises(ValueError): orc.write_table( table, buffer_output_stream, - compression_strategy='large', + compression_strategy='large', # type: ignore[reportArgumentType] ) # row_index_stride must be a positive integer