diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index bc7fe3cd6830..b0d526b1ee04 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -239,6 +239,11 @@ jobs:
       - name: Test
         shell: bash
         run: ci/scripts/python_test.sh $(pwd) $(pwd)/build
+      - name: Test annotations
+        shell: bash
+        env:
+          PYARROW_TEST_ANNOTATIONS: "ON"
+        run: ci/scripts/python_test_type_annotations.sh $(pwd)/python
 
   windows:
     name: AMD64 Windows 2022 Python 3.13
@@ -296,3 +301,7 @@ jobs:
         shell: cmd
         run: |
           call "ci\scripts\python_test.bat" %cd%
+      - name: Test annotations
+        shell: cmd
+        run: |
+          call "ci\scripts\python_test_type_annotations.bat" %cd%\python
diff --git a/ci/scripts/python_test_type_annotations.bat b/ci/scripts/python_test_type_annotations.bat
new file mode 100644
index 000000000000..3446e329a899
--- /dev/null
+++ b/ci/scripts/python_test_type_annotations.bat
@@ -0,0 +1,40 @@
+@rem Licensed to the Apache Software Foundation (ASF) under one
+@rem or more contributor license agreements.  See the NOTICE file
+@rem distributed with this work for additional information
+@rem regarding copyright ownership.  The ASF licenses this file
+@rem to you under the Apache License, Version 2.0 (the
+@rem "License"); you may not use this file except in compliance
+@rem with the License.  You may obtain a copy of the License at
+@rem
+@rem   http://www.apache.org/licenses/LICENSE-2.0
+@rem
+@rem Unless required by applicable law or agreed to in writing,
+@rem software distributed under the License is distributed on an
+@rem "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+@rem KIND, either express or implied.  See the License for the
+@rem specific language governing permissions and limitations
+@rem under the License.
+
+@echo on
+
+set PYARROW_DIR=%1
+
+echo Annotation testing on Windows ...
+
+@REM Install library stubs
+%PYTHON_CMD% -m pip install pandas-stubs scipy-stubs sphinx types-cffi types-psutil types-requests types-python-dateutil || exit /B 1
+
+@REM Install other dependencies for type checking
+%PYTHON_CMD% -m pip install fsspec || exit /B 1
+
+@REM Install type checkers
+%PYTHON_CMD% -m pip install mypy pyright ty || exit /B 1
+
+@REM Run type checkers; each one must pass, not only the last one
+pushd %PYARROW_DIR% || exit /B 1
+
+mypy || exit /B 1
+pyright || exit /B 1
+ty check || exit /B 1
+
+popd
diff --git a/ci/scripts/python_wheel_windows_build.bat b/ci/scripts/python_wheel_windows_build.bat
index fc256d72785c..2021e2d41d38 100644
--- a/ci/scripts/python_wheel_windows_build.bat
+++ b/ci/scripts/python_wheel_windows_build.bat
@@ -135,6 +135,11 @@ pushd C:\arrow\python
+@REM Populate stub docstrings first so that the wheel built below ships them
+%PYTHON_CMD% setup.py build_ext --inplace || exit /B 1
+%PYTHON_CMD% -m pip install click griffe libcst || exit /B 1
+%PYTHON_CMD% ..\dev\update_stub_docstrings.py pyarrow-stubs || exit /B 1
+
 @REM Build wheel
 %PYTHON_CMD% -m build --sdist --wheel . --no-isolation || exit /B 1
 
 @REM Repair the wheel with delvewheel
 @REM
 @REM Since we bundled the Arrow C++ libraries ourselves, we only need to
diff --git a/compose.yaml b/compose.yaml
index c799059fe254..87b79300011a 100644
--- a/compose.yaml
+++ b/compose.yaml
@@ -1539,8 +1539,7 @@ services:
         /arrow/ci/scripts/python_build.sh /arrow /build &&
         pip install -e /arrow/dev/archery[numpydoc] &&
         archery numpydoc --allow-rule GL10,PR01,PR03,PR04,PR05,PR10,RT03,YD01 &&
-        /arrow/ci/scripts/python_test.sh /arrow &&
-        /arrow/ci/scripts/python_test_type_annotations.sh /arrow/python"]
+        /arrow/ci/scripts/python_test.sh /arrow"]
 
   conda-python-dask:
     # Possible $DASK parameters:
diff --git a/dev/update_stub_docstrings.py b/dev/update_stub_docstrings.py
new file mode 100644
index 000000000000..eaeb2a510eb5
--- /dev/null
+++ b/dev/update_stub_docstrings.py
@@ -0,0 +1,214 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.
See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Utility to extract docstrings from pyarrow and update
+# docstrings in stubfiles.
+#
+# Usage
+# =====
+#
+# python ../dev/update_stub_docstrings.py pyarrow-stubs
+
+
+from pathlib import Path
+from textwrap import indent
+
+import click
+# TODO: perhaps replace griffe with importlib
+import griffe
+from griffe import AliasResolutionError
+import libcst
+from libcst import matchers as m
+
+
+def _get_docstring(name, package, indentation):
+    """Return ``name``'s docstring from ``package`` as a quoted literal, or None."""
+    try:
+        obj = package.get_member(name)
+    except (KeyError, ValueError, AliasResolutionError):
+        # Some cython __init__ symbols can't be found
+        # e.g. pyarrow.lib.OSFile.__init__
+        stack = name.split(".")
+        parent_name = ".".join(stack[:-1])
+
+        try:
+            obj = package.get_member(parent_name).all_members[stack[-1]]
+        except (KeyError, ValueError, AliasResolutionError):
+            print(f"{name} not found in {package.name}, it's probably ok.")
+            return None
+
+    if obj.has_docstring:
+        docstring = obj.docstring.value
+        # Remove signature if present in docstring
+        if docstring.startswith(obj.name) or (
+                (hasattr(obj.parent, "name") and
+                 docstring.startswith(f"{obj.parent.name}.{obj.name}"))):
+            docstring = "\n".join(docstring.splitlines()[2:])
+        # Skip empty docstrings
+        if docstring.strip() == "":
+            return None
+        # Indent the text and the closing quotes by `indentation` levels
+        prefix = indentation * "    "
+        return '"""\n' + indent(docstring + '\n"""', prefix)
+    return None
+
+
+class ReplaceEllipsis(libcst.CSTTransformer):
+    """Swap ``...`` stub bodies for docstrings looked up in ``package``."""
+
+    def __init__(self, package, namespace):
+        self.package = package
+        self.base_namespace = namespace
+        self.stack = []
+        self.indentation = 0
+
+    def leave_Module(self, original_node, updated_node):
+        """Insert module level docstring if _clone_signature is used."""
+        new_body = []
+        clone_matcher = m.SimpleStatementLine(
+            body=[m.Assign(
+                value=m.Call(func=m.Name(value="_clone_signature"))
+            ), m.ZeroOrMore()]
+        )
+        for statement in updated_node.body:
+            new_body.append(statement)
+            if m.matches(statement, clone_matcher):
+                name = statement.body[0].targets[0].target.value
+                if self.base_namespace:
+                    name = f"{self.base_namespace}.{name}"
+                docstring = _get_docstring(name, self.package, 0)
+                if docstring is not None:
+                    new_expr = libcst.Expr(value=libcst.SimpleString(docstring))
+                    new_body.append(libcst.SimpleStatementLine(body=[new_expr]))
+
+        return updated_node.with_changes(body=new_body)
+
+    def visit_ClassDef(self, node):
+        self.stack.append(node.name.value)
+        self.indentation += 1
+
+    def leave_ClassDef(self, original_node, updated_node):
+        """Give a class whose body starts with ``...`` or a def its docstring."""
+        name = ".".join(self.stack)
+        if self.base_namespace:
+            name = self.base_namespace + "." + name
+
+        class_matcher_1 = m.ClassDef(
+            name=m.Name(),
+            body=m.IndentedBlock(
+                body=[m.SimpleStatementLine(
+                    body=[m.Expr(m.Ellipsis()), m.ZeroOrMore()]
+                ), m.ZeroOrMore()]
+            )
+        )
+        class_matcher_2 = m.ClassDef(
+            name=m.Name(),
+            body=m.IndentedBlock(
+                body=[m.FunctionDef(), m.ZeroOrMore()]
+            )
+        )
+
+        if m.matches(updated_node, class_matcher_1):
+            docstring = _get_docstring(name, self.package, self.indentation)
+            if docstring is not None:
+                new_node = libcst.SimpleString(value=docstring)
+                updated_node = updated_node.deep_replace(
+                    updated_node.body.body[0].body[0].value, new_node)
+
+        if m.matches(updated_node, class_matcher_2):
+            docstring = _get_docstring(name, self.package, self.indentation)
+            if docstring is not None:
+                new_docstring = libcst.SimpleString(value=docstring)
+                new_body = [
+                    libcst.SimpleWhitespace(self.indentation * "    "),
+                    libcst.Expr(value=new_docstring),
+                    libcst.Newline()
+                ] + list(updated_node.body.body)
+                new_body = libcst.IndentedBlock(body=new_body)
+                updated_node = updated_node.with_changes(body=new_body)
+
+        self.stack.pop()
+        self.indentation -= 1
+        return updated_node
+
+    def visit_FunctionDef(self, node):
+        self.stack.append(node.name.value)
+        self.indentation += 1
+
+    def leave_FunctionDef(self, original_node, updated_node):
+        """Replace a function body that is only ``...`` with its docstring."""
+        name = ".".join(self.stack)
+        if self.base_namespace:
+            name = self.base_namespace + "." + name
+
+        function_matcher = m.FunctionDef(
+            name=m.Name(),
+            body=m.SimpleStatementSuite(
+                body=[m.Expr(
+                    m.Ellipsis()
+                )]))
+        if m.matches(original_node, function_matcher):
+            docstring = _get_docstring(name, self.package, self.indentation)
+            if docstring is not None:
+                new_docstring = libcst.SimpleString(value=docstring)
+                new_body = [
+                    libcst.SimpleWhitespace(self.indentation * "    "),
+                    libcst.Expr(value=new_docstring),
+                    libcst.Newline()
+                ]
+                new_body = libcst.IndentedBlock(body=new_body)
+                updated_node = updated_node.with_changes(body=new_body)
+
+        self.stack.pop()
+        self.indentation -= 1
+        return updated_node
+
+
+@click.command()
+@click.argument('pyarrow_folder', type=click.Path(resolve_path=True))
+def add_docs_to_stub_files(pyarrow_folder):
+    """Fill the ``.pyi`` stubs under PYARROW_FOLDER with pyarrow's docstrings."""
+    print("Updating docstrings of stub files in:", pyarrow_folder)
+    package = griffe.load("pyarrow", try_relative_path=True,
+                          force_inspection=True, resolve_aliases=True)
+    lib_modules = ["array", "builder", "compat", "config", "device", "error", "io",
+                   "_ipc", "memory", "pandas_shim", "scalar", "table", "tensor",
+                   "_types"]
+
+    for stub_file in Path(pyarrow_folder).rglob('*.pyi'):
+        if stub_file.name == "_stubs_typing.pyi":
+            continue
+        module = stub_file.with_suffix('').name
+        print(f"[{stub_file} {module}]")
+
+        # Be explicit about UTF-8: the default text encoding is locale-dependent
+        tree = libcst.parse_module(stub_file.read_text(encoding="utf-8"))
+
+        if module in lib_modules:
+            module = "lib"
+        elif stub_file.parent.name in ["parquet", "interchange"]:
+            module = f"{stub_file.parent.name}.{module}"
+        elif module == "__init__":
+            module = ""
+
+        modified_tree = tree.visit(ReplaceEllipsis(package, module))
+        stub_file.write_text(modified_tree.code, encoding="utf-8")
+        print("\n")
+
+
+if __name__ == "__main__":
+    add_docs_to_stub_files()
diff --git a/docs/source/developers/python/development.rst b/docs/source/developers/python/development.rst
index 5529ad25a294..2e2413522439 100644
--- a/docs/source/developers/python/development.rst
+++
b/docs/source/developers/python/development.rst @@ -42,7 +42,7 @@ Unit Testing ============ We are using `pytest `_ to develop our unit -test suite. After `building the project `_ you can run its unit tests +test suite. After `building the project `_ you can run its unit tests like so: .. code-block:: diff --git a/python/pyarrow-stubs/pyarrow/__init__.pyi b/python/pyarrow-stubs/pyarrow/__init__.pyi index ccec8d5abc07..a38ddaa6fe3e 100644 --- a/python/pyarrow-stubs/pyarrow/__init__.pyi +++ b/python/pyarrow-stubs/pyarrow/__init__.pyi @@ -15,15 +15,682 @@ # specific language governing permissions and limitations # under the License. -"""Type stubs for PyArrow. +from typing import Any +import pyarrow.lib as _lib -This is a placeholder stub file. -Complete type annotations will be added in subsequent PRs. -""" +from pyarrow.lib import ( + BuildInfo, + CppBuildInfo, + RuntimeInfo, + set_timezone_db_path, + MonthDayNano, + VersionInfo, + build_info, + cpp_build_info, + cpp_version, + cpp_version_info, + runtime_info, + cpu_count, + set_cpu_count, + enable_signal_handlers, + io_thread_count, + set_io_thread_count, +) + +from pyarrow.lib import ( + null, + bool_, + int8, + int16, + int32, + int64, + uint8, + uint16, + uint32, + uint64, + time32, + time64, + timestamp, + date32, + date64, + duration, + month_day_nano_interval, + float16, + float32, + float64, + binary, + string, + utf8, + binary_view, + string_view, + large_binary, + large_string, + large_utf8, + decimal32, + decimal64, + decimal128, + decimal256, + list_, + large_list, + list_view, + large_list_view, + map_, + struct, + union, + sparse_union, + dense_union, + dictionary, + run_end_encoded, + json_, + uuid, + fixed_shape_tensor, + bool8, + opaque, + field, + type_for_alias, + DataType, + DictionaryType, + StructType, + ListType, + LargeListType, + FixedSizeListType, + ListViewType, + LargeListViewType, + MapType, + UnionType, + SparseUnionType, + DenseUnionType, + TimestampType, + Time32Type, + Time64Type, + 
DurationType, + FixedSizeBinaryType, + Decimal32Type, + Decimal64Type, + Decimal128Type, + Decimal256Type, + BaseExtensionType, + ExtensionType, + RunEndEncodedType, + FixedShapeTensorType, + Bool8Type, + UuidType, + JsonType, + OpaqueType, + UnknownExtensionType, + register_extension_type, + unregister_extension_type, + DictionaryMemo, + KeyValueMetadata, + Field, + Schema, + schema, + unify_schemas, + Array, + Tensor, + array, + arange, + chunked_array, + record_batch, + nulls, + repeat, + SparseCOOTensor, + SparseCSRMatrix, + SparseCSCMatrix, + SparseCSFTensor, + infer_type, + from_numpy_dtype, + NullArray, + NumericArray, + IntegerArray, + FloatingPointArray, + BooleanArray, + Int8Array, + UInt8Array, + Int16Array, + UInt16Array, + Int32Array, + UInt32Array, + Int64Array, + UInt64Array, + HalfFloatArray, + FloatArray, + DoubleArray, + ListArray, + LargeListArray, + FixedSizeListArray, + ListViewArray, + LargeListViewArray, + MapArray, + UnionArray, + BinaryArray, + StringArray, + LargeBinaryArray, + LargeStringArray, + BinaryViewArray, + StringViewArray, + FixedSizeBinaryArray, + DictionaryArray, + Date32Array, + Date64Array, + TimestampArray, + Time32Array, + Time64Array, + DurationArray, + MonthDayNanoIntervalArray, + Decimal32Array, + Decimal64Array, + Decimal128Array, + Decimal256Array, + StructArray, + ExtensionArray, + RunEndEncodedArray, + FixedShapeTensorArray, + Bool8Array, + UuidArray, + JsonArray, + OpaqueArray, + scalar, + NA, + _NULL as NULL, + Scalar, + NullScalar, + BooleanScalar, + Int8Scalar, + Int16Scalar, + Int32Scalar, + Int64Scalar, + UInt8Scalar, + UInt16Scalar, + UInt32Scalar, + UInt64Scalar, + HalfFloatScalar, + FloatScalar, + DoubleScalar, + Decimal32Scalar, + Decimal64Scalar, + Decimal128Scalar, + Decimal256Scalar, + ListScalar, + LargeListScalar, + FixedSizeListScalar, + ListViewScalar, + LargeListViewScalar, + Date32Scalar, + Date64Scalar, + Time32Scalar, + Time64Scalar, + TimestampScalar, + DurationScalar, + 
MonthDayNanoIntervalScalar, + BinaryScalar, + LargeBinaryScalar, + BinaryViewScalar, + StringScalar, + LargeStringScalar, + StringViewScalar, + FixedSizeBinaryScalar, + DictionaryScalar, + MapScalar, + StructScalar, + UnionScalar, + RunEndEncodedScalar, + ExtensionScalar, + Bool8Scalar, + UuidScalar, + JsonScalar, + OpaqueScalar, +) + + +# Buffers, allocation +from pyarrow.lib import ( + DeviceAllocationType, + Device, + MemoryManager, + default_cpu_memory_manager +) + +from pyarrow.lib import ( + Buffer, + ResizableBuffer, + foreign_buffer, + py_buffer, + Codec, + compress, + decompress, + allocate_buffer, +) + +from pyarrow.lib import ( + MemoryPool, + LoggingMemoryPool, + ProxyMemoryPool, + total_allocated_bytes, + set_memory_pool, + default_memory_pool, + system_memory_pool, + jemalloc_memory_pool, + mimalloc_memory_pool, + logging_memory_pool, + proxy_memory_pool, + log_memory_allocations, + jemalloc_set_decay_ms, + supported_memory_backends, +) + +# I/O +from pyarrow.lib import ( + NativeFile, + PythonFile, + BufferedInputStream, + BufferedOutputStream, + CacheOptions, + CompressedInputStream, + CompressedOutputStream, + TransformInputStream, + transcoding_input_stream, + FixedSizeBufferWriter, + BufferReader, + BufferOutputStream, + OSFile, + MemoryMappedFile, + memory_map, + create_memory_map, + MockOutputStream, + input_stream, + output_stream, + have_libhdfs, +) + +from pyarrow.lib import ( + ChunkedArray, + RecordBatch, + Table, + table, + concat_arrays, + concat_batches, + concat_tables, + TableGroupBy, + RecordBatchReader, +) + +# Exceptions +from pyarrow.lib import ( + ArrowCancelled, + ArrowCapacityError, + ArrowException, + ArrowKeyError, + ArrowIndexError, + ArrowInvalid, + ArrowIOError, + ArrowMemoryError, + ArrowNotImplementedError, + ArrowTypeError, + ArrowSerializationError, +) + +from pyarrow.ipc import serialize_pandas, deserialize_pandas +import pyarrow.ipc as ipc +import pyarrow.lib as lib +import pyarrow.types as types +import 
pyarrow.feather as feather +import pyarrow.compute as compute +import pyarrow.csv as csv +import pyarrow.json as json +import pyarrow.dataset as dataset + +# ---------------------------------------------------------------------- +# Deprecations + +from pyarrow.util import _deprecate_api, _deprecate_class + +from pyarrow.ipc import ( + Message, + MessageReader, + MetadataVersion, + RecordBatchFileReader, + RecordBatchFileWriter, + RecordBatchStreamReader, + RecordBatchStreamWriter, +) + + +__version__: str +_gc_enabled: bool + + +def show_versions() -> None: ... +def show_info() -> None: ... +def _module_is_available(module: str) -> bool: ... +def _filesystem_is_available(fs: str) -> bool: ... + + +def get_include() -> str: ... +def _get_pkg_config_executable() -> str: ... +def _has_pkg_config(pkgname: str) -> bool: ... +def _read_pkg_config_variable(pkgname: str, cli_args: list[str]) -> str: ... +def get_libraries() -> list[str]: ... +def create_library_symlinks() -> None: ... +def get_library_dirs() -> list[str]: ... -from typing import Any -# TODO(GH-48970): remove __getattr__ before release as this -# will annotate non-existing attributes as Any. -# https://github.com/apache/arrow/issues/48970 -def __getattr__(name: str) -> Any: ... 
+__all__ = [ + "__version__", + "_lib", + "_gc_enabled", + "BuildInfo", + "CppBuildInfo", + "RuntimeInfo", + "set_timezone_db_path", + "MonthDayNano", + "VersionInfo", + "build_info", + "cpp_build_info", + "cpp_version", + "cpp_version_info", + "runtime_info", + "cpu_count", + "set_cpu_count", + "enable_signal_handlers", + "io_thread_count", + "set_io_thread_count", + "show_versions", + "show_info", + "_module_is_available", + "_filesystem_is_available", + "null", + "bool_", + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + "time32", + "time64", + "timestamp", + "date32", + "date64", + "duration", + "month_day_nano_interval", + "float16", + "float32", + "float64", + "binary", + "string", + "utf8", + "binary_view", + "string_view", + "large_binary", + "large_string", + "large_utf8", + "decimal32", + "decimal64", + "decimal128", + "decimal256", + "list_", + "large_list", + "list_view", + "large_list_view", + "map_", + "struct", + "union", + "sparse_union", + "dense_union", + "dictionary", + "run_end_encoded", + "json_", + "uuid", + "fixed_shape_tensor", + "bool8", + "opaque", + "field", + "type_for_alias", + "DataType", + "DictionaryType", + "StructType", + "ListType", + "LargeListType", + "FixedSizeListType", + "ListViewType", + "LargeListViewType", + "MapType", + "UnionType", + "SparseUnionType", + "DenseUnionType", + "TimestampType", + "Time32Type", + "Time64Type", + "DurationType", + "FixedSizeBinaryType", + "Decimal32Type", + "Decimal64Type", + "Decimal128Type", + "Decimal256Type", + "BaseExtensionType", + "ExtensionType", + "RunEndEncodedType", + "FixedShapeTensorType", + "Bool8Type", + "UuidType", + "JsonType", + "OpaqueType", + "UnknownExtensionType", + "register_extension_type", + "unregister_extension_type", + "DictionaryMemo", + "KeyValueMetadata", + "Field", + "Schema", + "schema", + "unify_schemas", + "Array", + "Tensor", + "array", + "arange", + "chunked_array", + "record_batch", + "nulls", + "repeat", + 
"SparseCOOTensor", + "SparseCSRMatrix", + "SparseCSCMatrix", + "SparseCSFTensor", + "infer_type", + "from_numpy_dtype", + "NullArray", + "NumericArray", + "IntegerArray", + "FloatingPointArray", + "BooleanArray", + "Int8Array", + "UInt8Array", + "Int16Array", + "UInt16Array", + "Int32Array", + "UInt32Array", + "Int64Array", + "UInt64Array", + "HalfFloatArray", + "FloatArray", + "DoubleArray", + "ListArray", + "LargeListArray", + "FixedSizeListArray", + "ListViewArray", + "LargeListViewArray", + "MapArray", + "UnionArray", + "BinaryArray", + "StringArray", + "LargeBinaryArray", + "LargeStringArray", + "BinaryViewArray", + "StringViewArray", + "FixedSizeBinaryArray", + "DictionaryArray", + "Date32Array", + "Date64Array", + "TimestampArray", + "Time32Array", + "Time64Array", + "DurationArray", + "MonthDayNanoIntervalArray", + "Decimal32Array", + "Decimal64Array", + "Decimal128Array", + "Decimal256Array", + "StructArray", + "ExtensionArray", + "Bool8Array", + "UuidArray", + "JsonArray", + "OpaqueArray", + "RunEndEncodedArray", + "FixedShapeTensorArray", + "scalar", + "NA", + "NULL", + "Scalar", + "NullScalar", + "BooleanScalar", + "Int8Scalar", + "Int16Scalar", + "Int32Scalar", + "Int64Scalar", + "UInt8Scalar", + "UInt16Scalar", + "UInt32Scalar", + "UInt64Scalar", + "HalfFloatScalar", + "FloatScalar", + "DoubleScalar", + "Decimal32Scalar", + "Decimal64Scalar", + "Decimal128Scalar", + "Decimal256Scalar", + "ListScalar", + "LargeListScalar", + "FixedSizeListScalar", + "ListViewScalar", + "LargeListViewScalar", + "Date32Scalar", + "Date64Scalar", + "Time32Scalar", + "Time64Scalar", + "TimestampScalar", + "DurationScalar", + "MonthDayNanoIntervalScalar", + "BinaryScalar", + "LargeBinaryScalar", + "BinaryViewScalar", + "StringScalar", + "LargeStringScalar", + "StringViewScalar", + "FixedSizeBinaryScalar", + "DictionaryScalar", + "MapScalar", + "StructScalar", + "UnionScalar", + "RunEndEncodedScalar", + "ExtensionScalar", + "Bool8Scalar", + "UuidScalar", + "JsonScalar", + 
"OpaqueScalar", + "DeviceAllocationType", + "Device", + "MemoryManager", + "default_cpu_memory_manager", + "Buffer", + "ResizableBuffer", + "foreign_buffer", + "py_buffer", + "Codec", + "compress", + "decompress", + "allocate_buffer", + "MemoryPool", + "LoggingMemoryPool", + "ProxyMemoryPool", + "total_allocated_bytes", + "set_memory_pool", + "default_memory_pool", + "system_memory_pool", + "jemalloc_memory_pool", + "mimalloc_memory_pool", + "logging_memory_pool", + "proxy_memory_pool", + "log_memory_allocations", + "jemalloc_set_decay_ms", + "supported_memory_backends", + "NativeFile", + "PythonFile", + "BufferedInputStream", + "BufferedOutputStream", + "CacheOptions", + "CompressedInputStream", + "CompressedOutputStream", + "TransformInputStream", + "transcoding_input_stream", + "FixedSizeBufferWriter", + "BufferReader", + "BufferOutputStream", + "OSFile", + "MemoryMappedFile", + "memory_map", + "create_memory_map", + "MockOutputStream", + "input_stream", + "output_stream", + "have_libhdfs", + "ChunkedArray", + "RecordBatch", + "Table", + "table", + "concat_arrays", + "concat_batches", + "concat_tables", + "TableGroupBy", + "RecordBatchReader", + "ArrowCancelled", + "ArrowCapacityError", + "ArrowException", + "ArrowKeyError", + "ArrowIndexError", + "ArrowInvalid", + "ArrowIOError", + "ArrowMemoryError", + "ArrowNotImplementedError", + "ArrowTypeError", + "ArrowSerializationError", + "serialize_pandas", + "deserialize_pandas", + "lib", + "ipc", + "types", + "_deprecate_api", + "_deprecate_class", + "Message", + "MessageReader", + "MetadataVersion", + "RecordBatchFileReader", + "RecordBatchFileWriter", + "RecordBatchStreamReader", + "RecordBatchStreamWriter", + "get_include", + "_get_pkg_config_executable", + "compute", + "feather", + "csv", + "json", + "_has_pkg_config", + "_read_pkg_config_variable", + "get_libraries", + "create_library_symlinks", + "dataset", + "get_library_dirs", +] diff --git a/python/pyarrow-stubs/pyarrow/tests/util.pyi 
b/python/pyarrow-stubs/pyarrow/tests/util.pyi new file mode 100644 index 000000000000..5ceb784588a7 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/tests/util.pyi @@ -0,0 +1,93 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Callable +from contextlib import AbstractContextManager +from decimal import Decimal +from os import PathLike +from typing import Any, Literal +import socket + +import pyarrow.fs + + +def randsign() -> int: ... +def random_seed(seed: int) -> AbstractContextManager[None]: ... +def randdecimal(precision: int, scale: int) -> Decimal: ... +def random_ascii(length: int) -> bytes: ... +def rands(nchars: int) -> str: ... +def get_modified_env_with_pythonpath() -> dict[str, str]: ... +def invoke_script(script_name: str, *args: str) -> None: ... +def changed_environ(name: str, value: str) -> AbstractContextManager[None]: ... +def change_cwd(path: str | PathLike[str]) -> AbstractContextManager[None]: ... +def disabled_gc() -> AbstractContextManager[None]: ... +def _filesystem_uri(path: str) -> str: ... 
+ + +def memory_leak_check( + f: Callable[[], Any], + metric: Literal['rss', 'vms', 'shared'] = 'rss', + threshold: int = 131072, + iterations: int = 10, + check_interval: int = 1 +) -> None: ... + + +class FSProtocolClass: + def __init__(self, path: str | PathLike[str]) -> None: ... + def __fspath__(self) -> str: ... + + +class ProxyHandler(pyarrow.fs.FileSystemHandler): + _fs: pyarrow.fs.FileSystem + def __init__(self, fs: pyarrow.fs.FileSystem) -> None: ... + def __eq__(self, other: object) -> bool: ... + def __ne__(self, other: object) -> bool: ... + def get_type_name(self) -> str: ... + def normalize_path(self, path: str) -> str: ... + def get_file_info(self, paths: list[str]) -> list[pyarrow.fs.FileInfo]: ... + def get_file_info_selector( + self, selector: pyarrow.fs.FileSelector) -> list[pyarrow.fs.FileInfo]: ... + + def create_dir(self, path: str, recursive: bool) -> None: ... + def delete_dir(self, path: str) -> None: ... + def delete_dir_contents(self, path: str, missing_dir_ok: bool = False) -> None: ... + def delete_root_dir_contents(self) -> None: ... + def delete_file(self, path: str) -> None: ... + def move(self, src: str, dest: str) -> None: ... + def copy_file(self, src: str, dest: str) -> None: ... + def open_input_stream(self, path: str) -> Any: ... + def open_input_file(self, path: str) -> Any: ... + def open_output_stream(self, path: str, metadata: dict[str, str]) -> Any: ... + def open_append_stream(self, path: str, metadata: dict[str, str]) -> Any: ... + + +def _ensure_minio_component_version(component: str, minimum_year: int) -> bool: ... +def _run_mc_command(mcdir: str, *args: str) -> None: ... +def windows_has_tzdata() -> bool: ... +def running_on_musllinux() -> bool: ... + + +def signal_wakeup_fd( + *, warn_on_full_buffer: bool = False) -> AbstractContextManager[socket.socket]: ... + + +def _configure_s3_limited_user( + s3_server: dict[str, Any], policy: str, username: str, password: str) -> None: ... 
+ + +def _wait_for_minio_startup( + mcdir: str, address: str, access_key: str, secret_key: str) -> None: ... diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 18a40d877c34..39abd3ee5715 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -35,11 +35,11 @@ import sys as _sys try: - from ._generated_version import version as __version__ + from ._generated_version import version as __version__ # type: ignore[import-untyped, import-not-found] # noqa: E501 except ImportError: # Package is not installed, parse git tag at runtime try: - import setuptools_scm + import setuptools_scm # type: ignore[import-not-found, import-untyped] # Code duplicated from setup.py to avoid a dependency on each other def parse_git(root, **kwargs): @@ -47,14 +47,14 @@ def parse_git(root, **kwargs): Parse function for setuptools_scm that ignores tags for non-C++ subprojects, e.g. apache-arrow-js-XXX tags. """ - from setuptools_scm.git import parse + from setuptools_scm.git import parse # type: ignore[import-not-found, import-untyped] # noqa: E501 kwargs['describe_command'] = \ "git describe --dirty --tags --long --match 'apache-arrow-[0-9]*.*'" return parse(root, **kwargs) __version__ = setuptools_scm.get_version('../', parse=parse_git) except ImportError: - __version__ = None + __version__ = None # type: ignore[assignment] from pyarrow.lib import (BuildInfo, CppBuildInfo, RuntimeInfo, set_timezone_db_path, MonthDayNano, VersionInfo, build_info, cpp_build_info, @@ -150,6 +150,8 @@ def print_entry(label, value): print(f" {codec: <20}: {status: <8}") +from pyarrow.lib import ( + DataType, Array, MemoryPool) # type: ignore[reportAttributeAccessIssue] from pyarrow.lib import (null, bool_, int8, int16, int32, int64, uint8, uint16, uint32, uint64, @@ -167,7 +169,7 @@ def print_entry(label, value): bool8, fixed_shape_tensor, json_, opaque, uuid, field, type_for_alias, - DataType, DictionaryType, StructType, + DictionaryType, StructType, ListType, 
LargeListType, FixedSizeListType, ListViewType, LargeListViewType, MapType, UnionType, SparseUnionType, DenseUnionType, @@ -184,8 +186,7 @@ def print_entry(label, value): Field, Schema, schema, - unify_schemas, - Array, Tensor, + unify_schemas, Tensor, array, chunked_array, record_batch, nulls, repeat, SparseCOOTensor, SparseCSRMatrix, SparseCSCMatrix, SparseCSFTensor, @@ -240,7 +241,7 @@ def print_entry(label, value): from pyarrow.lib import (Buffer, ResizableBuffer, foreign_buffer, py_buffer, Codec, compress, decompress, allocate_buffer) -from pyarrow.lib import (MemoryPool, LoggingMemoryPool, ProxyMemoryPool, +from pyarrow.lib import (LoggingMemoryPool, ProxyMemoryPool, total_allocated_bytes, set_memory_pool, default_memory_pool, system_memory_pool, jemalloc_memory_pool, mimalloc_memory_pool, @@ -362,7 +363,7 @@ def create_library_symlinks(): if _sys.platform == 'linux': bundled_libs = glob.glob(_os.path.join(package_cwd, '*.so.*')) - def get_symlink_path(hard_path): + def get_symlink_path(hard_path): # type: ignore[reportRedeclaration] return hard_path.rsplit('.', 1)[0] else: bundled_libs = glob.glob(_os.path.join(package_cwd, '*.*.dylib')) diff --git a/python/pyarrow/conftest.py b/python/pyarrow/conftest.py index 41beaa140419..0e8ef66485ec 100644 --- a/python/pyarrow/conftest.py +++ b/python/pyarrow/conftest.py @@ -114,13 +114,13 @@ defaults['timezone_data'] = os.path.exists("/usr/share/zoneinfo") try: - import cython # noqa + import cython # type: ignore[import-untyped, import-not-found] # noqa defaults['cython'] = True except ImportError: pass try: - import fastparquet # noqa + import fastparquet # type: ignore[import-untyped, import-not-found] # noqa defaults['fastparquet'] = True except ImportError: pass @@ -347,7 +347,7 @@ def func(ctx, x): pc.register_aggregate_function(func, func_name, - func_doc, + func_doc, # type: ignore { "x": pa.float64(), }, diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index 
575444c1cfc2..3f227d3101c7 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -64,7 +64,8 @@ if os.environ.get('TZDIR', None) is None: from importlib import resources try: - os.environ['TZDIR'] = os.path.join(resources.files('tzdata'), 'zoneinfo') + tzdata_path = resources.files('tzdata') + os.environ['TZDIR'] = os.path.join(str(tzdata_path), 'zoneinfo') except ModuleNotFoundError: print( 'Package "tzdata" not found. Not setting TZDIR environment variable.' @@ -191,6 +192,7 @@ def decorate(func): def wrapper(*args, **kwargs): remaining_attempts = attempts curr_delay = delay + last_exception = None while remaining_attempts > 0: try: return func(*args, **kwargs) @@ -201,6 +203,9 @@ def wrapper(*args, **kwargs): if max_delay: curr_delay = min(curr_delay, max_delay) time.sleep(curr_delay) + # At this point, we've exhausted all attempts and last_exception must be set + # (since we must have caught at least one exception to exit the loop) + assert last_exception is not None, "No attempts were made" raise last_exception return wrapper return decorate diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py index 3c31650ddf94..9188d5d41cc2 100644 --- a/python/pyarrow/tests/strategies.py +++ b/python/pyarrow/tests/strategies.py @@ -17,31 +17,32 @@ import datetime import sys +from typing import Any -import pytest -import hypothesis as h -import hypothesis.strategies as st +import pytest # type: ignore[import-not-found] +import hypothesis as h # type: ignore[import-not-found] +import hypothesis.strategies as st # type: ignore[import-not-found] try: - import hypothesis.extra.numpy as npst + import hypothesis.extra.numpy as npst # type: ignore[import-not-found] except ImportError: - npst = None + npst = None # type: ignore[assignment] try: - import hypothesis.extra.pytz as tzst + import hypothesis.extra.pytz as tzst # type: ignore[import-not-found] except ImportError: - tzst = None + tzst = None # type: 
ignore[assignment] try: import zoneinfo except ImportError: - zoneinfo = None + zoneinfo = None # type: ignore[assignment] if sys.platform == 'win32': try: - import tzdata # noqa:F401 + import tzdata # type: ignore[import-not-found, import-untyped] # noqa:F401 except ImportError: - zoneinfo = None + zoneinfo = None # type: ignore[assignment] try: import numpy as np except ImportError: - np = None + np = None # type: ignore[assignment] import pyarrow as pa @@ -151,12 +152,12 @@ timezones = st.one_of(st.none(), st.timezones()) else: timezones = st.none() -timestamp_types = st.builds( +timestamp_types: Any = st.builds( pa.timestamp, unit=st.sampled_from(['s', 'ms', 'us', 'ns']), tz=timezones ) -duration_types = st.builds( +duration_types: Any = st.builds( pa.duration, st.sampled_from(['s', 'ms', 'us', 'ns']) ) @@ -253,13 +254,13 @@ def schemas(type_strategy=primitive_types, max_fields=None): all_types = st.deferred( lambda: ( - primitive_types | - list_types() | - struct_types() | - dictionary_types() | - map_types() | - list_types(all_types) | - struct_types(all_types) + primitive_types + | list_types() + | struct_types() + | dictionary_types() + | map_types() + | list_types(all_types) # type: ignore[has-type] + | struct_types(all_types) # type: ignore[has-type] ) ) all_fields = st.one_of( @@ -303,6 +304,7 @@ def arrays(draw, type, size=None, nullable=True): elif not isinstance(size, int): raise TypeError('Size must be an integer') + assert npst is not None if pa.types.is_null(ty): h.assume(nullable) value = st.none() @@ -315,6 +317,7 @@ def arrays(draw, type, size=None, nullable=True): values = draw(npst.arrays(ty.to_pandas_dtype(), shape=(size,))) # Workaround ARROW-4952: no easy way to assert array equality # in a NaN-tolerant way. 
+ assert np is not None values[np.isnan(values)] = -42.0 return pa.array(values, type=ty) elif pa.types.is_decimal(ty): @@ -340,9 +343,11 @@ def arrays(draw, type, size=None, nullable=True): offset = ty.tz.split(":") offset_hours = int(offset[0]) offset_min = int(offset[1]) - tz = datetime.timedelta(hours=offset_hours, minutes=offset_min) + tz = datetime.timezone( + datetime.timedelta(hours=offset_hours, minutes=offset_min) + ) except ValueError: - tz = zoneinfo.ZoneInfo(ty.tz) + tz = zoneinfo.ZoneInfo(str(ty.tz)) value = st.datetimes(timezones=st.just(tz), min_value=min_datetime, max_value=max_datetime) elif pa.types.is_duration(ty): @@ -501,7 +506,9 @@ def pandas_compatible_list_types( dictionary_types( value_strategy=pandas_compatible_dictionary_value_types ), - pandas_compatible_list_types(pandas_compatible_types), - struct_types(pandas_compatible_types) + pandas_compatible_list_types( + pandas_compatible_types # type: ignore[has-type] + ), + struct_types(pandas_compatible_types) # type: ignore[has-type] ) ) diff --git a/python/pyarrow/tests/test_adhoc_memory_leak.py b/python/pyarrow/tests/test_adhoc_memory_leak.py index 76a766984dab..9f61bc7ddfea 100644 --- a/python/pyarrow/tests/test_adhoc_memory_leak.py +++ b/python/pyarrow/tests/test_adhoc_memory_leak.py @@ -20,7 +20,7 @@ try: import numpy as np except ImportError: - np = None + pass import pyarrow as pa import pyarrow.tests.util as test_util diff --git a/python/pyarrow/tests/test_cpp_internals.py b/python/pyarrow/tests/test_cpp_internals.py index 7508d8f0b981..7d652acf62f1 100644 --- a/python/pyarrow/tests/test_cpp_internals.py +++ b/python/pyarrow/tests/test_cpp_internals.py @@ -20,7 +20,8 @@ import pytest -from pyarrow._pyarrow_cpp_tests import get_cpp_tests +from pyarrow._pyarrow_cpp_tests import ( # type: ignore[import-not-found, import-untyped] # noqa: E501 + get_cpp_tests) def inject_cpp_tests(ns): diff --git a/python/pyarrow/tests/test_cython.py b/python/pyarrow/tests/test_cython.py index 
a142e66db567..11ef01412a6f 100644 --- a/python/pyarrow/tests/test_cython.py +++ b/python/pyarrow/tests/test_cython.py @@ -89,7 +89,7 @@ def test_cython_api(tmpdir): Basic test for the Cython API. """ # Fail early if cython is not found - import cython # noqa + import cython # type: ignore[import-untyped, import-not-found] # noqa with tmpdir.as_cwd(): # Set up temporary workspace diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index ebac37e862b6..941e73c8167a 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -22,12 +22,13 @@ import weakref from uuid import uuid4, UUID import sys +from typing import cast import pytest try: import numpy as np except ImportError: - np = None + pass import pyarrow as pa from pyarrow.vendored.version import Version @@ -79,12 +80,14 @@ def __init__(self): def __arrow_ext_serialize__(self): # XXX pa.BaseExtensionType should expose C++ serialization method + assert isinstance(self.storage_type, IntegerType) return self.storage_type.__arrow_ext_serialize__() @classmethod def __arrow_ext_deserialize__(cls, storage_type, serialized): + assert isinstance(storage_type, IntegerType) deserialized_storage_type = storage_type.__arrow_ext_deserialize__( - serialized) + storage_type, serialized) assert deserialized_storage_type == storage_type return cls() @@ -160,7 +163,7 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized): class MyStructType(pa.ExtensionType): - storage_type = pa.struct([('left', pa.int64()), + storage_type = pa.struct([('left', pa.int64()), # type: ignore[assignment] ('right', pa.int64())]) def __init__(self): @@ -221,7 +224,7 @@ def __arrow_ext_serialize__(self): @classmethod def __arrow_ext_deserialize__(cls, storage_type, serialized): assert serialized == b'' - return cls(storage_type) + return cls(storage_type, annotation=None) def ipc_write_batch(batch): @@ -432,8 +435,8 @@ def 
test_ext_array_wrap_array(): arr.validate(full=True) assert isinstance(arr, pa.ChunkedArray) assert arr.type == ty - assert arr.chunk(0).storage == storage.chunk(0) - assert arr.chunk(1).storage == storage.chunk(1) + assert arr.chunk(0).storage == storage.chunk(0) # type: ignore[union-attr] + assert arr.chunk(1).storage == storage.chunk(1) # type: ignore[union-attr] # Wrong storage type storage = pa.array([b"foo", b"bar", None]) @@ -442,7 +445,7 @@ def test_ext_array_wrap_array(): # Not an array or chunked array with pytest.raises(TypeError, match="Expected array or chunked array"): - ty.wrap_array(None) + ty.wrap_array(None) # type: ignore[arg-type] def test_ext_scalar_from_array(): @@ -876,7 +879,7 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized): def __eq__(self, other): if isinstance(other, pa.BaseExtensionType): return (isinstance(self, type(other)) and - self.freq == other.freq) + self.freq == other.freq) # type: ignore[attr-defined] else: return NotImplemented @@ -902,7 +905,7 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized): storage_type, serialized).freq return PeriodTypeWithToPandasDtype(freq) - def to_pandas_dtype(self): + def to_pandas_dtype(self): # type: ignore[override] import pandas as pd return pd.PeriodDtype(freq=self.freq) @@ -1033,7 +1036,7 @@ def test_generic_ext_array_pickling(registered_period_type, pickle_module): def test_generic_ext_type_register(registered_period_type): # test that trying to register other type does not segfault with pytest.raises(TypeError): - pa.register_extension_type(pa.string()) + pa.register_extension_type(pa.string()) # type: ignore[arg-type] # register second time raises KeyError period_type = PeriodType('D') @@ -1058,11 +1061,13 @@ def test_parquet_period(tmpdir, registered_period_type): # in the serialized arrow schema meta = pq.read_metadata(filename) assert meta.schema.column(0).physical_type == "INT64" + assert meta.metadata is not None assert b"ARROW:schema" in meta.metadata 
import base64 decoded_schema = base64.b64decode(meta.metadata[b"ARROW:schema"]) - schema = pa.ipc.read_schema(pa.BufferReader(decoded_schema)) + schema = pa.ipc.read_schema(pa.BufferReader( + decoded_schema)) # Since the type could be reconstructed, the extension type metadata is # absent. assert schema.field("ext").metadata == {} @@ -1434,6 +1439,7 @@ def test_tensor_class_methods(np_type_str): storage = pa.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], pa.list_(arrow_type, 6)) arr = pa.ExtensionArray.from_storage(tensor_type, storage) + arr = cast(pa.FixedShapeTensorArray, arr) expected = np.array( [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dtype=np.dtype(np_type_str) @@ -1442,7 +1448,7 @@ def test_tensor_class_methods(np_type_str): np.testing.assert_array_equal(arr.to_numpy_ndarray(), expected) expected = np.array([[[7, 8, 9], [10, 11, 12]]], dtype=np.dtype(np_type_str)) - result = arr[1:].to_numpy_ndarray() + result = arr[1:].to_numpy_ndarray() # type: ignore[union-attr] np.testing.assert_array_equal(result, expected) values = [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]] @@ -1452,35 +1458,43 @@ def test_tensor_class_methods(np_type_str): tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[0, 1, 2]) result = pa.ExtensionArray.from_storage(tensor_type, storage) + result = cast(pa.FixedShapeTensorArray, result) expected = np.array( [[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]], dtype=np.dtype(np_type_str) ) np.testing.assert_array_equal(result.to_numpy_ndarray(), expected) - result = flat_arr.reshape(1, 2, 3, 2) + result_reshaped = flat_arr.reshape(1, 2, 3, 2) expected = np.array( [[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]], dtype=np.dtype(np_type_str) ) - np.testing.assert_array_equal(result, expected) + np.testing.assert_array_equal(result_reshaped, expected) tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[0, 2, 1]) result = pa.ExtensionArray.from_storage(tensor_type, storage) + result 
= cast(pa.FixedShapeTensorArray, result) expected = as_strided(flat_arr, shape=(1, 2, 3, 2), strides=(bw * 12, bw * 6, bw, bw * 3)) np.testing.assert_array_equal(result.to_numpy_ndarray(), expected) tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[2, 0, 1]) - result = pa.ExtensionArray.from_storage(tensor_type, storage) + result = pa.ExtensionArray.from_storage( + tensor_type, storage) # type: ignore[assignment] expected = as_strided(flat_arr, shape=(1, 3, 2, 2), strides=(bw * 12, bw, bw * 6, bw * 2)) - np.testing.assert_array_equal(result.to_numpy_ndarray(), expected) - - assert result.type.permutation == [2, 0, 1] - assert result.type.shape == [2, 2, 3] + np.testing.assert_array_equal( + result.to_numpy_ndarray(), expected) # type: ignore[union-attr] + + result_type = result.type + assert isinstance(result, pa.FixedShapeTensorArray) + assert isinstance(result_type, pa.FixedShapeTensorType) + assert result_type.permutation == [2, 0, 1] + assert result_type.shape == [2, 2, 3] assert result.to_tensor().shape == (1, 3, 2, 2) - assert result.to_tensor().strides == (12 * bw, 1 * bw, 6 * bw, 2 * bw) + assert result.to_tensor().strides == (12 * bw, 1 * bw, 6 * bw, + 2 * bw) @pytest.mark.numpy @@ -1508,17 +1522,23 @@ def test_tensor_array_from_numpy(np_type_str): arr = flat_arr.reshape(1, 3, 4) tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr) - assert tensor_array_from_numpy.type.shape == [3, 4] - assert tensor_array_from_numpy.type.permutation == [0, 1] - assert tensor_array_from_numpy.type.dim_names is None + result_type = tensor_array_from_numpy.type + assert isinstance(tensor_array_from_numpy, pa.FixedShapeTensorArray) + assert isinstance(result_type, pa.FixedShapeTensorType) + assert result_type.shape == [3, 4] + assert result_type.permutation == [0, 1] + assert result_type.dim_names is None assert tensor_array_from_numpy.to_tensor() == pa.Tensor.from_numpy(arr) arr = as_strided(flat_arr, shape=(1, 2, 3, 2), 
strides=(bw * 12, bw * 6, bw, bw * 3)) tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr) - assert tensor_array_from_numpy.type.shape == [2, 2, 3] - assert tensor_array_from_numpy.type.permutation == [0, 2, 1] - assert tensor_array_from_numpy.type.dim_names is None + result_type = tensor_array_from_numpy.type + assert isinstance(tensor_array_from_numpy, pa.FixedShapeTensorArray) + assert isinstance(result_type, pa.FixedShapeTensorType) + assert result_type.shape == [2, 2, 3] + assert result_type.permutation == [0, 2, 1] + assert result_type.dim_names is None assert tensor_array_from_numpy.to_tensor() == pa.Tensor.from_numpy(arr) arr = flat_arr.reshape(1, 2, 3, 2) @@ -1532,7 +1552,8 @@ def test_tensor_array_from_numpy(np_type_str): arr = np.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], dtype=np.dtype(np_type_str)) expected = arr[1:] - result = pa.FixedShapeTensorArray.from_numpy_ndarray(arr)[1:].to_numpy_ndarray() + result = cast(pa.FixedShapeTensorArray, pa.FixedShapeTensorArray.from_numpy_ndarray( + arr)[1:]).to_numpy_ndarray() np.testing.assert_array_equal(result, expected) arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=np.dtype(np_type_str)) @@ -1559,22 +1580,27 @@ def test_tensor_array_from_numpy(np_type_str): dim_names = ["a", "b"] tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray( arr, dim_names=dim_names) - assert tensor_array_from_numpy.type.value_type == arrow_type - assert tensor_array_from_numpy.type.shape == [2, 3] - assert tensor_array_from_numpy.type.dim_names == dim_names + result_type = tensor_array_from_numpy.type + assert isinstance(tensor_array_from_numpy, pa.FixedShapeTensorArray) + assert isinstance(result_type, pa.FixedShapeTensorType) + assert result_type.value_type == arrow_type + assert result_type.shape == [2, 3] + assert result_type.dim_names == dim_names with pytest.raises(ValueError, match="The length of dim_names"): pa.FixedShapeTensorArray.from_numpy_ndarray(arr, 
dim_names=['only_one']) with pytest.raises(TypeError, match="dim_names must be a tuple or list"): - pa.FixedShapeTensorArray.from_numpy_ndarray(arr, dim_names=123) + pa.FixedShapeTensorArray.from_numpy_ndarray( + arr, dim_names=123) # type: ignore[arg-type] with pytest.raises(TypeError, match="dim_names must be a tuple or list"): pa.FixedShapeTensorArray.from_numpy_ndarray( - arr, dim_names=(x for x in range(2))) + arr, dim_names=(x for x in range(2))) # type: ignore[arg-type] with pytest.raises(TypeError, match="Each element of dim_names must be a string"): - pa.FixedShapeTensorArray.from_numpy_ndarray(arr, dim_names=[0, 1]) + pa.FixedShapeTensorArray.from_numpy_ndarray( + arr, dim_names=[0, 1]) # type: ignore[arg-type] @pytest.mark.numpy @@ -1845,14 +1871,18 @@ def test_bool8_to_numpy_conversion(): assert np.array_equal(arr_to_np, np_arr_no_nulls) # same underlying buffer - assert arr_to_np.ctypes.data == arr_no_nulls.buffers()[1].address + buffer = arr_no_nulls.buffers()[1] + assert buffer is not None + assert arr_to_np.ctypes.data == buffer.address # if the user requests a writable array, a copy should be performed arr_to_np_writable = arr_no_nulls.to_numpy(zero_copy_only=False, writable=True) assert np.array_equal(arr_to_np_writable, np_arr_no_nulls) # different underlying buffer - assert arr_to_np_writable.ctypes.data != arr_no_nulls.buffers()[1].address + buffer = arr_no_nulls.buffers()[1] + assert buffer is not None + assert arr_to_np_writable.ctypes.data != buffer.address @pytest.mark.numpy @@ -1867,7 +1897,9 @@ def test_bool8_from_numpy_conversion(): assert arr_from_np == canonical_bool8_arr_no_nulls # same underlying buffer - assert arr_from_np.buffers()[1].address == np_arr_no_nulls.ctypes.data + buffer = arr_from_np.buffers()[1] + assert buffer is not None + assert buffer.address == np_arr_no_nulls.ctypes.data # conversion only valid for 1-D arrays with pytest.raises( @@ -1882,7 +1914,7 @@ def test_bool8_from_numpy_conversion(): ValueError, 
match="Cannot convert 0-D array to bool8 array", ): - pa.Bool8Array.from_numpy(np.bool_()) + pa.Bool8Array.from_numpy(np.bool_(False)) # type: ignore[arg-type] # must use compatible storage type with pytest.raises( diff --git a/python/pyarrow/tests/test_gdb.py b/python/pyarrow/tests/test_gdb.py index 912953ae60d2..50d81b686aca 100644 --- a/python/pyarrow/tests/test_gdb.py +++ b/python/pyarrow/tests/test_gdb.py @@ -101,6 +101,8 @@ def wait_until_ready(self): Record output until the gdb prompt displays. Return recorded output. """ # TODO: add timeout? + assert self.proc is not None + assert self.proc.stdout is not None while (not self.last_stdout_line.startswith(b"(gdb) ") and self.proc.poll() is None): block = self.proc.stdout.read(4096) @@ -125,6 +127,8 @@ def wait_until_ready(self): return out def issue_command(self, line): + assert self.proc is not None + assert self.proc.stdin is not None line = line.encode('utf-8') + b"\n" if self.verbose: sys.stdout.buffer.write(line) @@ -158,6 +162,7 @@ def select_frame(self, func_name): m = re.search(pat, out) if m is None: pytest.fail(f"Could not select frame for function {func_name}") + return # Never reached, but helps type checker frame_num = int(m[1]) out = self.run_command(f"frame {frame_num}") @@ -165,6 +170,8 @@ def select_frame(self, func_name): def join(self): if self.proc is not None: + assert self.proc.stdin is not None + assert self.proc.stdout is not None self.proc.stdin.close() self.proc.stdout.close() # avoid ResourceWarning self.proc.kill() diff --git a/python/pyarrow/tests/test_jvm.py b/python/pyarrow/tests/test_jvm.py index d2ba780efc7f..b5d4e74f126f 100644 --- a/python/pyarrow/tests/test_jvm.py +++ b/python/pyarrow/tests/test_jvm.py @@ -38,11 +38,13 @@ def root_allocator(): arrow_dir = os.path.join(os.path.dirname(__file__), '..', '..', '..') pom_path = os.path.join(arrow_dir, 'java', 'pom.xml') tree = ET.parse(pom_path) - version = tree.getroot().find( + version_element = tree.getroot().find( 
'POM:version', namespaces={ 'POM': 'http://maven.apache.org/POM/4.0.0' - }).text + }) + assert version_element is not None + version = version_element.text jar_path = os.path.join( arrow_dir, 'java', 'tools', 'target', f'arrow-tools-{version}-jar-with-dependencies.jar') @@ -76,8 +78,8 @@ def test_jvm_buffer(root_allocator): def test_jvm_buffer_released(root_allocator): - import jpype.imports # noqa - from java.lang import IllegalArgumentException + import jpype.imports # type: ignore[import-untyped, import-not-found] # noqa + from java.lang import IllegalArgumentException # type: ignore[import-not-found] jvm_buffer = root_allocator.buffer(8) jvm_buffer.release() diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index 65f0c6081363..20a33a382e41 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -20,11 +20,12 @@ import pytest import weakref from collections.abc import Sequence, Mapping +from typing import cast try: import numpy as np except ImportError: - np = None + pass import pyarrow as pa import pyarrow.compute as pc @@ -68,7 +69,7 @@ pa.Time32Scalar), (datetime.datetime.now().time(), None, pa.Time64Scalar), (datetime.timedelta(days=1), None, pa.DurationScalar), - (pa.MonthDayNano([1, -1, -10100]), None, + (pa.MonthDayNano([1, -1, -10100]), None, # type: ignore[call-arg, arg-type] pa.MonthDayNanoIntervalScalar), ({'a': 1, 'b': [1, 2]}, None, pa.StructScalar), ([('a', 1), ('b', 2)], pa.map_(pa.string(), pa.int8()), pa.MapScalar), @@ -360,7 +361,8 @@ def test_time_from_datetime_time(): def test_temporal_values(value, time_type: pa.DataType): time_scalar = pa.scalar(value, type=time_type) time_scalar.validate(full=True) - assert time_scalar.value == value + assert (time_scalar.value # type: ignore[union-attr, reportAttributeAccessIssue] + == value) def test_cast(): @@ -422,7 +424,9 @@ def test_timestamp(): expected = pd.Timestamp('2000-01-01 12:34:56') assert arrow_arr[0].as_py() == 
expected - assert arrow_arr[0].value * 1000**i == expected.value + value = cast(pa.TimestampScalar, arrow_arr[0]).value + assert value is not None + assert value * 1000**i == expected.value tz = 'America/New_York' arrow_type = pa.timestamp(unit, tz=tz) @@ -434,7 +438,9 @@ def test_timestamp(): .tz_convert(tz)) assert arrow_arr[0].as_py() == expected - assert arrow_arr[0].value * 1000**i == expected.value + value = cast(pa.TimestampScalar, arrow_arr[0]).value + assert value is not None + assert value * 1000**i == expected.value @pytest.mark.nopandas @@ -529,7 +535,7 @@ def test_duration_nanos_nopandas(): def test_month_day_nano_interval(): - triple = pa.MonthDayNano([-3600, 1800, -50]) + triple = pa.MonthDayNano([-3600, 1800, -50]) # type: ignore[invalid-argument-type] arr = pa.array([triple]) assert isinstance(arr[0].as_py(), pa.MonthDayNano) assert arr[0].as_py() == triple @@ -577,7 +583,7 @@ def test_binary(value, ty, scalar_typ): with pytest.raises(ValueError): memoryview(s) else: - assert buf.to_pybytes() == value + assert buf.to_pybytes() == value # type: ignore[union-attr] assert isinstance(buf, pa.Buffer) assert bytes(s) == value @@ -852,7 +858,7 @@ def test_dictionary(pickle_module): assert arr.to_pylist() == expected for j, (i, v) in enumerate(zip(indices, expected)): - s = arr[j] + s = cast(pa.DictionaryScalar, arr[j]) assert s.as_py() == v assert s.value.as_py() == v @@ -868,14 +874,14 @@ def test_run_end_encoded(): values = [1, 2, 1, None, 3] arr = pa.RunEndEncodedArray.from_arrays(run_ends, values) - scalar = arr[0] + scalar = cast(pa.RunEndEncodedScalar, arr[0]) assert isinstance(scalar, pa.RunEndEncodedScalar) assert isinstance(scalar.value, pa.Int64Scalar) assert scalar.value == pa.array(values)[0] assert scalar.as_py() == 1 # null -> .value is still a scalar, as_py returns None - scalar = arr[10] + scalar = cast(pa.RunEndEncodedScalar, arr[10]) assert isinstance(scalar.value, pa.Int64Scalar) assert scalar.as_py() is None @@ -901,13 +907,13 @@ def 
test_union(pickle_module): with pytest.raises(pa.ArrowNotImplementedError): pickle_module.loads(pickle_module.dumps(s)) - assert arr[0].type_code == 0 + assert cast(pa.UnionScalar, arr[0]).type_code == 0 assert arr[0].as_py() == "a" - assert arr[1].type_code == 0 + assert cast(pa.UnionScalar, arr[1]).type_code == 0 assert arr[1].as_py() == "b" - assert arr[2].type_code == 1 + assert cast(pa.UnionScalar, arr[2]).type_code == 1 assert arr[2].as_py() == 3 - assert arr[3].type_code == 1 + assert cast(pa.UnionScalar, arr[3]).type_code == 1 assert arr[3].as_py() == 4 # dense @@ -927,9 +933,9 @@ def test_union(pickle_module): with pytest.raises(pa.ArrowNotImplementedError): pickle_module.loads(pickle_module.dumps(s)) - assert arr[0].type_code == 0 + assert cast(pa.UnionScalar, arr[0]).type_code == 0 assert arr[0].as_py() == b'a' - assert arr[5].type_code == 1 + assert cast(pa.UnionScalar, arr[5]).type_code == 1 assert arr[5].as_py() == 3 diff --git a/python/pyarrow/tests/test_strategies.py b/python/pyarrow/tests/test_strategies.py index babb839b534e..9505b9a11b04 100644 --- a/python/pyarrow/tests/test_strategies.py +++ b/python/pyarrow/tests/test_strategies.py @@ -25,7 +25,7 @@ @h.given(past.all_types) def test_types(ty): - assert isinstance(ty, pa.lib.DataType) + assert isinstance(ty, pa.DataType) @h.given(past.all_fields) @@ -41,7 +41,7 @@ def test_schemas(schema): @pytest.mark.numpy @h.given(past.all_arrays) def test_arrays(array): - assert isinstance(array, pa.lib.Array) + assert isinstance(array, pa.Array) @pytest.mark.numpy diff --git a/python/pyarrow/tests/test_without_numpy.py b/python/pyarrow/tests/test_without_numpy.py index 55c12602ce89..c5f5671aabc8 100644 --- a/python/pyarrow/tests/test_without_numpy.py +++ b/python/pyarrow/tests/test_without_numpy.py @@ -50,6 +50,7 @@ def test_tensor_to_np(): arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]] storage = pa.array(arr, pa.list_(pa.int32(), 4)) tensor_array = pa.ExtensionArray.from_storage(tensor_type, 
storage) + assert isinstance(tensor_array, pa.FixedShapeTensorArray) tensor = tensor_array.to_tensor() msg = "Cannot return a numpy.ndarray if NumPy is not present" diff --git a/python/pyarrow/tests/util.py b/python/pyarrow/tests/util.py index 7e3dd4324e93..fca0fec1122a 100644 --- a/python/pyarrow/tests/util.py +++ b/python/pyarrow/tests/util.py @@ -171,7 +171,8 @@ def get_modified_env_with_pythonpath(): existing_pythonpath = env.get('PYTHONPATH', '') module_path = os.path.abspath( - os.path.dirname(os.path.dirname(pa.__file__))) + os.path.dirname(os.path.dirname( # type: ignore[no-matching-overload] + pa.__file__))) if existing_pythonpath: new_pythonpath = os.pathsep.join((module_path, existing_pythonpath)) @@ -336,6 +337,7 @@ def _ensure_minio_component_version(component, minimum_year): stderr=subprocess.PIPE, encoding='utf-8') as proc: if proc.wait(10) != 0: return False + assert proc.stdout is not None stdout = proc.stdout.read() pattern = component + r' version RELEASE\.(\d+)-.*' version_match = re.search(pattern, stdout) @@ -367,6 +369,8 @@ def _run_mc_command(mcdir, *args): cmd_str = ' '.join(full_args) print(f'Cmd: {cmd_str}') print(f' Return: {retval}') + assert proc.stdout is not None + assert proc.stderr is not None print(f' Stdout: {proc.stdout.read()}') print(f' Stderr: {proc.stderr.read()}') if retval != 0: diff --git a/python/pyarrow/vendored/docscrape.py b/python/pyarrow/vendored/docscrape.py index 6c4d6e01400b..47aeeed40aed 100644 --- a/python/pyarrow/vendored/docscrape.py +++ b/python/pyarrow/vendored/docscrape.py @@ -18,7 +18,7 @@ import sys -def strip_blank_lines(l): +def strip_blank_lines(l): # noqa: E741 "Remove leading and trailing blank lines from a list of lines" while l and not l[0].strip(): del l[0] @@ -62,7 +62,7 @@ def read(self): return '' def seek_next_non_empty_line(self): - for l in self[self._l:]: + for l in self[self._l:]: # noqa: E741 if l.strip(): break else: @@ -185,8 +185,9 @@ def _is_at_section(self): l2 = 
self._doc.peek(1).strip() # ---------- or ========== if len(l2) >= 3 and (set(l2) in ({'-'}, {'='})) and len(l2) != len(l1): snip = '\n'.join(self._doc._str[:2])+'...' - self._error_location("potentially wrong underline length... \n%s \n%s in \n%s" - % (l1, l2, snip), error=False) + self._error_location( + "potentially wrong underline length... \n%s \n%s in \n%s" + % (l1, l2, snip), error=False) return l2.startswith('-'*len(l1)) or l2.startswith('='*len(l1)) def _strip(self, doc): diff --git a/python/pyproject.toml b/python/pyproject.toml index 217dba81b873..19b2186e21ee 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -99,38 +99,26 @@ version_scheme = 'guess-next-dev' git_describe_command = 'git describe --dirty --tags --long --match "apache-arrow-[0-9]*.*"' fallback_version = '24.0.0a0' -# TODO: Enable type checking once stubs are merged [tool.mypy] -files = ["pyarrow-stubs"] +files = ["pyarrow", "pyarrow-stubs"] mypy_path = "$MYPY_CONFIG_FILE_DIR/pyarrow-stubs" -exclude = [ - "^pyarrow/", - "^benchmarks/", - "^examples/", - "^scripts/", -] +exclude = 'pyarrow/interchange/.*|pyarrow/tests/interchange/.*|pyarrow/vendored/.*|pyarrow/tests/test_cuda*' -# TODO: Enable type checking once stubs are merged [tool.pyright] pythonPlatform = "All" pythonVersion = "3.10" -include = ["pyarrow-stubs"] -exclude = [ - "pyarrow", - "benchmarks", - "examples", - "scripts", - "build", -] +include = ["pyarrow", "pyarrow-stubs"] +exclude = ["pyarrow/vendored", "pyarrow/interchange", "pyarrow/tests/interchange", "pyarrow/tests/test_cuda*"] stubPath = "pyarrow-stubs" typeCheckingMode = "basic" -# TODO: Enable type checking once stubs are merged [tool.ty.src] -include = ["pyarrow-stubs"] -exclude = [ - "pyarrow", - "benchmarks", - "examples", - "scripts", -] +include = ["pyarrow", "pyarrow-stubs"] +exclude = ["pyarrow/vendored", "pyarrow/interchange", "pyarrow/tests/interchange", "pyarrow/tests/test_cuda*"] + +[tool.ty.environment] +root = ["pyarrow"] + +[tool.ty.rules] 
+unresolved-import = "ignore" +unresolved-attribute = "ignore" diff --git a/python/scripts/run_emscripten_tests.py b/python/scripts/run_emscripten_tests.py index 406dfc54e4fc..e54f0c223ab4 100644 --- a/python/scripts/run_emscripten_tests.py +++ b/python/scripts/run_emscripten_tests.py @@ -114,7 +114,7 @@ def end_headers(self): def run_server_thread(dist_dir, q): - global _SERVER_ADDRESS + global _SERVER_ADDRESS # noqa: F824 os.chdir(dist_dir) server = http.server.HTTPServer(("", 0), TemplateOverrider) q.put(server.server_address)