diff --git a/.cargo/audit.toml b/.cargo/audit.toml index 09e2d35c50..71354ea3a5 100644 --- a/.cargo/audit.toml +++ b/.cargo/audit.toml @@ -33,4 +33,9 @@ ignore = [ # # Introduced by object_store, see https://github.com/apache/arrow-rs-object-store/issues/564 "RUSTSEC-2025-0134", + # `rand` unsoundness with custom logger using `rand::rng()` + # + # Direct dependency upgraded to 0.9.3+. Transitive rand 0.8.5 remains + # from reqsign/sqllogictest/rustc-hash — no 0.8.x patch exists. + "RUSTSEC-2026-0097", ] diff --git a/.github/actions/overwrite-package-version/action.yml b/.github/actions/overwrite-package-version/action.yml index 8a2739456e..aed736ecf9 100644 --- a/.github/actions/overwrite-package-version/action.yml +++ b/.github/actions/overwrite-package-version/action.yml @@ -25,7 +25,7 @@ runs: using: "composite" steps: - name: Setup Python - uses: actions/setup-python@v5 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: python-version: '3.12' diff --git a/.github/actions/setup-builder/action.yml b/.github/actions/setup-builder/action.yml index 532174590f..e961ed6335 100644 --- a/.github/actions/setup-builder/action.yml +++ b/.github/actions/setup-builder/action.yml @@ -26,8 +26,8 @@ runs: using: "composite" steps: - name: Setup specified Rust toolchain - shell: bash if: ${{ inputs.rust-version != '' }} + shell: bash env: RUST_VERSION: ${{ inputs.rust-version }} run: | diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 23c9b239ee..03235972dd 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -17,6 +17,15 @@ version: 2 updates: + # Maintain dependencies for GitHub Actions + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" + day: "sunday" + cooldown: + default-days: 7 + # Maintain dependencies for iceberg - package-ecosystem: "cargo" directory: "/" @@ -35,3 +44,5 @@ updates: patterns: - "arrow*" - "parquet" + cooldown: + default-days: 7 diff --git a/.github/workflows/asf-allowlist-check.yml b/.github/workflows/asf-allowlist-check.yml new file mode 100644 index 0000000000..65dbe8bcbe --- /dev/null +++ b/.github/workflows/asf-allowlist-check.yml @@ -0,0 +1,46 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# Verifies all GitHub Actions refs are on the ASF allowlist. +# Actions not on the allowlist silently fail with "Startup failure" — no logs, +# no notifications, and PRs may appear green because no checks ran. 
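
A note on the audit.toml entry at the top of this diff: the direct `rand` upgrade it mentions lands later in this change set as `rand = "0.8.5"` -> `rand = "0.9.3"` in the workspace Cargo.toml. As a minimal, illustrative sketch (not code from this repository) of what changes at call sites between rand 0.8 and 0.9:

```rust
use rand::Rng;

fn main() {
    // rand 0.8:              rand 0.9:
    //   rand::thread_rng()     rand::rng()
    //   rng.gen::<u64>()       rng.random::<u64>()
    //   rng.gen_range(a..b)    rng.random_range(a..b)
    // (`gen` was renamed because it is a reserved keyword in Rust 2024.)
    let mut rng = rand::rng();
    let n: u64 = rng.random();
    let roll: u32 = rng.random_range(1..=6);
    println!("n={n} roll={roll}");
}
```
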
+# See https://github.com/apache/infrastructure-actions/issues/574 +name: "ASF Allowlist Check" + +on: + pull_request: + paths: + - ".github/**" + push: + branches: + - main + paths: + - ".github/**" + +permissions: + contents: read + +jobs: + asf-allowlist-check: + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + with: + persist-credentials: false + - uses: apache/infrastructure-actions/allowlist-check@4e9c961f587f72b170874b6f5cd4ac15f7f26eb8 # main diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml index abe0c377c5..3f9865ed8a 100644 --- a/.github/workflows/audit.yml +++ b/.github/workflows/audit.yml @@ -37,18 +37,23 @@ on: - cron: '0 0 * * *' permissions: + # All other permissions are set to none contents: read + checks: write + issues: write jobs: security_audit: runs-on: ubuntu-latest if: github.repository == 'apache/iceberg-rust' steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + with: + persist-credentials: false - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: rust-version: stable - - uses: rustsec/audit-check@v2.0.0 + - uses: rustsec/audit-check@69366f33c96575abad1ee0dba8212993eecbe998 # v2.0.0 with: token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/bindings_python_ci.yml b/.github/workflows/bindings_python_ci.yml index efd4a78098..4483a53310 100644 --- a/.github/workflows/bindings_python_ci.yml +++ b/.github/workflows/bindings_python_ci.yml @@ -47,7 +47,9 @@ jobs: check-rust: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + with: + persist-credentials: false - name: Check format working-directory: "bindings/python" run: cargo fmt --all -- --check @@ -58,8 +60,10 @@ jobs: check-python: runs-on: ubuntu-slim steps: - - uses: actions/checkout@v6 - - uses: astral-sh/setup-uv@5a095e7a2014a4212f075830d4f7277575a9d098 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + with: + persist-credentials: false + - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8.0.0 with: version: "0.9.3" enable-cache: true @@ -85,16 +89,18 @@ jobs: - macos-latest - windows-latest steps: - - uses: actions/checkout@v6 - - uses: actions/setup-python@v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + with: + persist-credentials: false + - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6 with: python-version: 3.12 - - uses: PyO3/maturin-action@v1 + - uses: PyO3/maturin-action@e83996d129638aa358a18fbd1dfb82f0b0fb5d3b # v1.51.0 with: working-directory: "bindings/python" command: build args: --out dist -i python3.12 # Explicitly set interpreter; manylinux containers have multiple Pythons and maturin may pick an older one - - uses: astral-sh/setup-uv@5a095e7a2014a4212f075830d4f7277575a9d098 + - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8.0.0 with: version: "0.9.3" enable-cache: true diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ea2257b676..1949015462 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -53,19 +53,21 @@ jobs: - ubuntu-latest - macos-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + with: + persist-credentials: false - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Check License Header - 
uses: apache/skywalking-eyes/header@v0.8.0 + uses: apache/skywalking-eyes/header@61275cc80d0798a405cb070f7d3a8aaf7cf2c2c1 # v0.8.0 - name: Check toml format run: make check-toml - name: Install protoc - uses: arduino/setup-protoc@v3 + uses: arduino/setup-protoc@c65c819552d16ad3c9b72d9dfd5ba5237b9c906b # v3 with: repo-token: ${{ secrets.GITHUB_TOKEN }} @@ -91,16 +93,18 @@ jobs: - macos-latest - windows-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + with: + persist-credentials: false - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Cache Rust artifacts - uses: Swatinem/rust-cache@v2 + uses: swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2 - name: Install protoc - uses: arduino/setup-protoc@v3 + uses: arduino/setup-protoc@c65c819552d16ad3c9b72d9dfd5ba5237b9c906b # v3 with: repo-token: ${{ secrets.GITHUB_TOKEN }} @@ -117,13 +121,15 @@ jobs: - macos-latest - windows-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + with: + persist-credentials: false - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Cache Rust artifacts - uses: Swatinem/rust-cache@v2 + uses: swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2 - name: Build run: cargo build -p iceberg --no-default-features @@ -138,24 +144,26 @@ jobs: - { name: "doc", args: "--doc --all-features --workspace" } name: Tests (${{ matrix.test-suite.name }}) steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + with: + persist-credentials: false - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Install protoc - uses: arduino/setup-protoc@v3 + uses: arduino/setup-protoc@c65c819552d16ad3c9b72d9dfd5ba5237b9c906b # v3 with: repo-token: ${{ secrets.GITHUB_TOKEN }} - name: Cache Rust artifacts - uses: Swatinem/rust-cache@v2 + uses: swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2 with: key: ${{ matrix.test-suite.name }} - name: Install cargo-nextest if: matrix.test-suite.name == 'default' - uses: taiki-e/install-action@v2 + uses: taiki-e/install-action@0abfcd587b70a713fdaa7fb502c885e2112acb15 # v2.75.7 with: tool: cargo-nextest @@ -164,6 +172,7 @@ jobs: run: make docker-up - name: Run tests + shell: bash env: # Disable debug info to speed up compilation and reduce artifact size RUSTFLAGS: "-C debuginfo=0" @@ -182,9 +191,11 @@ jobs: name: Verify MSRV runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + with: + persist-credentials: false - name: Install protoc - uses: arduino/setup-protoc@v3 + uses: arduino/setup-protoc@c65c819552d16ad3c9b72d9dfd5ba5237b9c906b # v3 with: repo-token: ${{ secrets.GITHUB_TOKEN }} - name: Get MSRV diff --git a/.github/workflows/ci_typos.yml b/.github/workflows/ci_typos.yml index 8031cd8ca9..089ddfe8e2 100644 --- a/.github/workflows/ci_typos.yml +++ b/.github/workflows/ci_typos.yml @@ -43,6 +43,8 @@ jobs: env: FORCE_COLOR: 1 steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + with: + persist-credentials: false - name: Check typos - uses: crate-ci/typos@v1.44.0 + uses: crate-ci/typos@02ea592e44b3a53c302f697cddca7641cd051c3d # v1.45.0 diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 17bfd8bf3d..7e9c8208c8 100644 --- a/.github/workflows/codeql.yml 
+++ b/.github/workflows/codeql.yml @@ -41,14 +41,16 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v6 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + with: + persist-credentials: false - name: Initialize CodeQL - uses: github/codeql-action/init@v4 + uses: github/codeql-action/init@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4.35.2 with: languages: actions - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v4 + uses: github/codeql-action/analyze@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4.35.2 with: category: "/language:actions" diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 26f61118b7..83e1031d17 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -32,6 +32,7 @@ permissions: jobs: publish: runs-on: ubuntu-latest + environment: publish strategy: max-parallel: 1 # Publish package one by one instead of flooding the registry matrix: @@ -46,7 +47,9 @@ jobs: - "crates/catalog/sql" - "crates/integrations/datafusion" steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + with: + persist-credentials: false - name: Get MSRV id: get-msrv @@ -61,6 +64,19 @@ jobs: working-directory: ${{ matrix.package }} # Only publish if it's a tag and the tag is not a pre-release if: ${{ startsWith(github.ref, 'refs/tags/') && !contains(github.ref, '-') }} - run: cargo publish --all-features + run: cargo publish --all-features # zizmor: ignore[use-trusted-publishing] -- https://github.com/apache/iceberg-rust/issues/1539 + shell: bash env: CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} + + # Trigger Python release after crate publishing completes. + # Only runs for tag pushes; for manual Python releases, use workflow_dispatch on release_python.yml directly. 
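
The `release_tag` forwarded to the called workflow here is validated below in release_python.yml against `^v[0-9]+\.[0-9]+\.[0-9]+(-rc\.[0-9]+)?$`. For readers who prefer the rule spelled out, a hedged Rust equivalent using the workspace's `regex` crate (the function name is hypothetical, not part of the repository):

```rust
use regex::Regex;

/// Accepts v<major>.<minor>.<patch>, optionally suffixed with -rc.<num>,
/// mirroring the bash `[[ =~ ]]` check in release_python.yml.
fn is_valid_release_tag(tag: &str) -> bool {
    let re = Regex::new(r"^v[0-9]+\.[0-9]+\.[0-9]+(-rc\.[0-9]+)?$").unwrap();
    re.is_match(tag)
}

fn main() {
    assert!(is_valid_release_tag("v0.4.0"));
    assert!(is_valid_release_tag("v0.4.0-rc.1"));
    assert!(!is_valid_release_tag("0.4.0")); // missing `v` prefix
    assert!(!is_valid_release_tag("v0.4.0-rc")); // rc without a number
}
```
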
+ release-python: + needs: [publish] + if: ${{ startsWith(github.ref, 'refs/tags/') }} + permissions: + contents: read + id-token: write # Required for PyPI trusted publishing in the called workflow + uses: ./.github/workflows/release_python.yml + with: + release_tag: ${{ github.ref_name }} diff --git a/.github/workflows/release_python.yml b/.github/workflows/release_python.yml index abf8b52b6d..0638cff6df 100644 --- a/.github/workflows/release_python.yml +++ b/.github/workflows/release_python.yml @@ -18,10 +18,12 @@ name: Publish Python 🐍 distribution 📦 to PyPI on: - workflow_run: - workflows: ["Publish"] # Trigger this workflow after the "publish.yml" workflow completes - types: - - completed + workflow_call: + inputs: + release_tag: + description: 'Release tag (e.g., v0.4.0 or v0.4.0-rc.1)' + required: true + type: string workflow_dispatch: inputs: release_tag: @@ -33,37 +35,24 @@ permissions: contents: read jobs: - check-cargo-publish: - runs-on: ubuntu-latest - # Only run if the triggering workflow succeeded OR if manually triggered - if: ${{ github.event.workflow_run.conclusion == 'success' || github.event_name == 'workflow_dispatch' }} - steps: - - run: echo 'The Publish workflow passed or was manually triggered' - validate-release-tag: runs-on: ubuntu-latest - needs: [check-cargo-publish] outputs: cargo-version: ${{ steps.validate.outputs.cargo-version }} is-rc: ${{ steps.validate.outputs.is-rc }} steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 if: ${{ github.event_name == 'workflow_dispatch' }} + with: + persist-credentials: false - name: Validate release tag format id: validate - # Use input for workflow_dispatch, otherwise use `workflow_run.head_branch` - # Note, `workflow_run.head_branch` does not contain `refs/tags/` prefix, just the tag name, i.e. `v0.4.0` or `v0.4.0-rc.1` # Valid formats: v<major>.<minor>.<patch> OR v<major>.<minor>.<patch>-rc.<num> + shell: bash env: - DISPATCH_RELEASE_TAG: ${{ github.event.inputs.release_tag }} - RUN_HEAD_BRANCH: ${{ github.event.workflow_run.head_branch }} + RELEASE_TAG: ${{ inputs.release_tag }} run: | - if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then - RELEASE_TAG="$DISPATCH_RELEASE_TAG" - else - RELEASE_TAG="$RUN_HEAD_BRANCH" - fi echo "Validating release tag: $RELEASE_TAG" if [[ !
"$RELEASE_TAG" =~ ^v[0-9]+\.[0-9]+\.[0-9]+(-rc\.[0-9]+)?$ ]]; then echo "❌ Invalid release tag format: $RELEASE_TAG" @@ -114,7 +103,9 @@ jobs: runs-on: ubuntu-latest needs: [validate-release-tag] steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + with: + persist-credentials: false - name: Install toml-cli if: ${{ needs.validate-release-tag.outputs.is-rc == 'true' }} @@ -124,19 +115,22 @@ jobs: if: ${{ needs.validate-release-tag.outputs.is-rc == 'true' }} working-directory: "bindings/python" run: | - echo "Setting cargo version to: ${{ needs.validate-release-tag.outputs.cargo-version }}" - toml set Cargo.toml package.version ${{ needs.validate-release-tag.outputs.cargo-version }} > Cargo.toml.tmp + echo "Setting cargo version to: ${NEEDS_VALIDATE_RELEASE_TAG_OUTPUTS_CARGO_VERSION}" + toml set Cargo.toml package.version "${NEEDS_VALIDATE_RELEASE_TAG_OUTPUTS_CARGO_VERSION}" > Cargo.toml.tmp # doing this explicitly to avoid issue in Windows where `mv` does not overwrite existing file rm Cargo.toml mv Cargo.toml.tmp Cargo.toml + shell: bash + env: + NEEDS_VALIDATE_RELEASE_TAG_OUTPUTS_CARGO_VERSION: ${{ needs.validate-release-tag.outputs.cargo-version }} - - uses: PyO3/maturin-action@v1 + - uses: PyO3/maturin-action@e83996d129638aa358a18fbd1dfb82f0b0fb5d3b # v1.51.0 with: working-directory: "bindings/python" command: sdist args: -o dist - name: Upload sdist - uses: actions/upload-artifact@v7 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: wheels-sdist path: bindings/python/dist @@ -158,7 +152,9 @@ jobs: } - { os: ubuntu-latest, target: "armv7l" } steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + with: + persist-credentials: false - name: Install toml-cli if: ${{ needs.validate-release-tag.outputs.is-rc == 'true' }} @@ -167,14 +163,17 @@ jobs: - name: Set cargo version for RC if: ${{ needs.validate-release-tag.outputs.is-rc == 'true' }} working-directory: "bindings/python" + shell: bash + env: + CARGO_VERSION: ${{ needs.validate-release-tag.outputs.cargo-version }} run: | - echo "Setting cargo version to: ${{ needs.validate-release-tag.outputs.cargo-version }}" - toml set Cargo.toml package.version ${{ needs.validate-release-tag.outputs.cargo-version }} > Cargo.toml.tmp + echo "Setting cargo version to: $CARGO_VERSION" + toml set Cargo.toml package.version "$CARGO_VERSION" > Cargo.toml.tmp # doing this explicitly to avoid issue in Windows where `mv` does not overwrite existing file rm Cargo.toml mv Cargo.toml.tmp Cargo.toml - - uses: actions/setup-python@v6 + - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6 with: python-version: 3.12 - name: Get MSRV @@ -185,7 +184,7 @@ jobs: uses: ./.github/actions/setup-builder with: rust-version: ${{ steps.get-msrv.outputs.msrv }} - - uses: PyO3/maturin-action@v1 + - uses: PyO3/maturin-action@e83996d129638aa358a18fbd1dfb82f0b0fb5d3b # v1.51.0 with: target: ${{ matrix.target }} manylinux: ${{ matrix.manylinux || 'auto' }} @@ -193,7 +192,7 @@ jobs: command: build args: --release -o dist -i python3.12 # Explicitly set interpreter; manylinux containers have multiple Pythons and maturin may pick an older one - name: Upload wheels - uses: actions/upload-artifact@v7 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: wheels-${{ matrix.os }}-${{ matrix.target }} path: bindings/python/dist @@ -212,13 +211,13 @@ jobs: steps: - name: 
Download all the dists - uses: actions/download-artifact@v8 + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8 with: pattern: wheels-* merge-multiple: true path: bindings/python/dist - name: Publish to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 + uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # v1.13.0 with: skip-existing: true packages-dir: bindings/python/dist diff --git a/.github/workflows/release_python_nightly.yml b/.github/workflows/release_python_nightly.yml index 595cb42d05..26b034554c 100644 --- a/.github/workflows/release_python_nightly.yml +++ b/.github/workflows/release_python_nightly.yml @@ -27,6 +27,7 @@ permissions: jobs: set-version: + if: github.repository == 'apache/iceberg-rust' || github.event_name == 'workflow_dispatch' # Run on schedule for apache repo, or on manual dispatch from any repo runs-on: ubuntu-latest outputs: timestamp: ${{ steps.set-ts.outputs.TIMESTAMP }} @@ -37,30 +38,30 @@ jobs: sdist: needs: set-version - if: github.repository == 'apache/iceberg-rust' # Only run for apache repo runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + with: + persist-credentials: false - uses: ./.github/actions/overwrite-package-version # Overwrite package version with timestamp with: timestamp: ${{ needs.set-version.outputs.TIMESTAMP }} - - uses: PyO3/maturin-action@v1 + - uses: PyO3/maturin-action@e83996d129638aa358a18fbd1dfb82f0b0fb5d3b # v1.51.0 with: working-directory: "bindings/python" command: sdist args: -o dist - name: Upload sdist - uses: actions/upload-artifact@v7 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: wheels-sdist path: bindings/python/dist wheels: needs: set-version - if: github.repository == 'apache/iceberg-rust' # Only run for apache repo runs-on: "${{ matrix.os }}" strategy: max-parallel: 15 @@ -76,13 +77,15 @@ jobs: } - { os: ubuntu-latest, target: "armv7l" } steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + with: + persist-credentials: false - uses: ./.github/actions/overwrite-package-version # Overwrite package version with timestamp with: timestamp: ${{ needs.set-version.outputs.TIMESTAMP }} - - uses: actions/setup-python@v6 + - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6 with: python-version: 3.12 @@ -95,7 +98,7 @@ jobs: with: rust-version: ${{ steps.get-msrv.outputs.msrv }} - - uses: PyO3/maturin-action@v1 + - uses: PyO3/maturin-action@e83996d129638aa358a18fbd1dfb82f0b0fb5d3b # v1.51.0 with: target: ${{ matrix.target }} manylinux: ${{ matrix.manylinux || 'auto' }} @@ -104,12 +107,13 @@ jobs: args: --release -o dist -i python3.12 # Explicitly set interpreter; manylinux containers have multiple Pythons and maturin may pick an older one - name: Upload wheels - uses: actions/upload-artifact@v7 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: wheels-${{ matrix.os }}-${{ matrix.target }} path: bindings/python/dist testpypi-publish: + if: github.repository == 'apache/iceberg-rust' # Only run for apache repo needs: [sdist, wheels] runs-on: ubuntu-latest @@ -122,7 +126,7 @@ jobs: steps: - name: Download all the dists - uses: actions/download-artifact@v8 + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8 with: pattern: wheels-* merge-multiple: true @@ -132,7 +136,7 @@ jobs: - name: Publish to TestPyPI 
id: publish-testpypi continue-on-error: true - uses: pypa/gh-action-pypi-publish@release/v1 + uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # v1.13.0 with: repository-url: https://test.pypi.org/legacy/ skip-existing: true diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index e2afce4c71..c3d3f18294 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -32,7 +32,7 @@ jobs: if: github.repository_owner == 'apache' runs-on: ubuntu-24.04 steps: - - uses: actions/stale@v10.2.0 + - uses: actions/stale@b5d41d4e1d5dceea10e7104786b73624c18a190f # v10.2.0 with: # stale issues stale-issue-label: 'stale,security' diff --git a/.github/workflows/website.yml b/.github/workflows/website.yml index 59bd2c6f2c..71fb9503c9 100644 --- a/.github/workflows/website.yml +++ b/.github/workflows/website.yml @@ -39,15 +39,17 @@ jobs: permissions: contents: write steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + with: + persist-credentials: false - name: Setup mdBook - uses: peaceiris/actions-mdbook@v2 + uses: peaceiris/actions-mdbook@ee69d230fe19748b7abf22df32acaa93833fad08 # v2 with: mdbook-version: "0.4.36" - name: Install protoc - uses: arduino/setup-protoc@v3 + uses: arduino/setup-protoc@c65c819552d16ad3c9b72d9dfd5ba5237b9c906b # v3 with: repo-token: ${{ secrets.GITHUB_TOKEN }} @@ -64,7 +66,7 @@ jobs: cp -r target/doc ./website/book/api - name: Deploy to gh-pages - uses: peaceiris/actions-gh-pages@v4.0.0 + uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0 if: github.event_name == 'push' && github.ref_name == 'main' with: github_token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/zizmor.yml b/.github/workflows/zizmor.yml new file mode 100644 index 0000000000..313835fcbe --- /dev/null +++ b/.github/workflows/zizmor.yml @@ -0,0 +1,44 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +name: GitHub Actions Security Analysis with zizmor 🌈 + +on: + push: + branches: ["main"] + pull_request: + branches: ["**"] + +permissions: {} + +jobs: + zizmor: + name: Run zizmor 🌈 + runs-on: ubuntu-latest + permissions: {} + steps: + - name: Checkout repository + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + + - name: Run zizmor 🌈 + uses: zizmorcore/zizmor-action@71321a20a9ded102f6e9ce5718a2fcec2c4f70d8 # v0.5.2 + with: + advanced-security: false diff --git a/.typos.toml b/.typos.toml index 407ce8168c..36996a553a 100644 --- a/.typos.toml +++ b/.typos.toml @@ -18,5 +18,9 @@ [type.rust] extend-ignore-identifiers-re = ["^bimap$"] +[default.extend-words] +ags = "ags" +AGS = "AGS" + [files] extend-exclude = ["**/testdata", "CHANGELOG.md"] diff --git a/Cargo.lock b/Cargo.lock index a24ef04626..98bdd58fc0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -193,7 +193,7 @@ dependencies = [ "miniz_oxide", "num-bigint", "quad-rand", - "rand 0.9.2", + "rand 0.9.4", "regex-lite", "serde", "serde_bytes", @@ -665,9 +665,9 @@ dependencies = [ [[package]] name = "aws-sdk-s3tables" -version = "1.53.0" +version = "1.54.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c91febb29f5287a7b723dbacca6d81b1086b8ac0af6b35b873539ee19c74827f" +checksum = "2e0ec266873694efc365debded01f44e27a0de3946a3ac15d24c489759e5ddf8" dependencies = [ "aws-credential-types", "aws-runtime", @@ -1127,6 +1127,20 @@ name = "bytemuck" version = "1.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec" +dependencies = [ + "bytemuck_derive", +] + +[[package]] +name = "bytemuck_derive" +version = "1.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9abbd1bc6865053c427f7198e6af43bfdedc55ab791faed4fbd361d789575ff" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] [[package]] name = "byteorder" @@ -1629,9 +1643,9 @@ dependencies = [ [[package]] name = "datafusion" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de9f8117889ba9503440f1dd79ebab32ba52ccf1720bb83cd718a29d4edc0d16" +checksum = "93db0e623840612f7f2cd757f7e8a8922064192363732c88692e0870016e141b" dependencies = [ "arrow", "arrow-schema", @@ -1673,7 +1687,7 @@ dependencies = [ "object_store", "parking_lot", "parquet", - "rand 0.9.2", + "rand 0.9.4", "regex", "sqlparser", "tempfile", @@ -1685,9 +1699,9 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be893b73a13671f310ffcc8da2c546b81efcc54c22e0382c0a28aa3537017137" +checksum = "37cefde60b26a7f4ff61e9d2ff2833322f91df2b568d7238afe67bde5bdffb66" dependencies = [ "arrow", "async-trait", @@ -1710,9 +1724,9 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830487b51ed83807d6b32d6325f349c3144ae0c9bf772cf2a712db180c31d5e6" +checksum = "17e112307715d6a7a331111a4c2330ff54bc237183511c319e3708a4cff431fb" dependencies = [ "arrow", "async-trait", @@ -1761,9 +1775,9 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"0d7663f3af955292f8004e74bcaf8f7ea3d66cc38438749615bb84815b61a293" +checksum = "d72a11ca44a95e1081870d3abb80c717496e8a7acb467a1d3e932bb636af5cc2" dependencies = [ "ahash", "apache-avro", @@ -1788,9 +1802,9 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f590205c7e32fe1fea48dd53ffb406e56ae0e7a062213a3ac848db8771641bd" +checksum = "89f4afaed29670ec4fd6053643adc749fe3f4bc9d1ce1b8c5679b22c67d12def" dependencies = [ "futures", "log", @@ -1799,9 +1813,9 @@ dependencies = [ [[package]] name = "datafusion-datasource" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fde1e030a9dc87b743c806fbd631f5ecfa2ccaa4ffb61fa19144a07fea406b79" +checksum = "e9fb386e1691355355a96419978a0022b7947b44d4a24a6ea99f00b6b485cbb6" dependencies = [ "arrow", "async-compression", @@ -1825,7 +1839,7 @@ dependencies = [ "liblzma", "log", "object_store", - "rand 0.9.2", + "rand 0.9.4", "tokio", "tokio-util", "url", @@ -1834,9 +1848,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-arrow" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "331ebae7055dc108f9b54994b93dff91f3a17445539efe5b74e89264f7b36e15" +checksum = "ffa6c52cfed0734c5f93754d1c0175f558175248bf686c944fb05c373e5fc096" dependencies = [ "arrow", "arrow-ipc", @@ -1858,9 +1872,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-avro" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49dda81c79b6ba57b1853a9158abc66eb85a3aa1cede0c517dabec6d8a4ed3aa" +checksum = "a579c3bd290c66ea4b269493e75e8a3ed42c9c895a651f10210a29538aee50c4" dependencies = [ "apache-avro", "arrow", @@ -1878,9 +1892,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-csv" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e0d475088325e2986876aa27bb30d0574f72a22955a527d202f454681d55c5c" +checksum = "503f29e0582c1fc189578d665ff57d9300da1f80c282777d7eb67bb79fb8cdca" dependencies = [ "arrow", "async-trait", @@ -1901,9 +1915,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-json" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea1520d81f31770f3ad6ee98b391e75e87a68a5bb90de70064ace5e0a7182fe8" +checksum = "e33804749abc8d0c8cb7473228483cb8070e524c6f6086ee1b85a64debe2b3d2" dependencies = [ "arrow", "async-trait", @@ -1925,9 +1939,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-parquet" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95be805d0742ab129720f4c51ad9242cd872599cdb076098b03f061fcdc7f946" +checksum = "32a8e0365e0e08e8ff94d912f0ababcf9065a1a304018ba90b1fc83c855b4997" dependencies = [ "arrow", "async-trait", @@ -1955,15 +1969,15 @@ dependencies = [ [[package]] name = "datafusion-doc" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c93ad9e37730d2c7196e68616f3f2dd3b04c892e03acd3a8eeca6e177f3c06a" +checksum = "8de6ac0df1662b9148ad3c987978b32cbec7c772f199b1d53520c8fa764a87ee" [[package]] name = "datafusion-execution" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"9437d3cd5d363f9319f8122182d4d233427de79c7eb748f23054c9aaa0fdd8df" +checksum = "c03c7fbdaefcca4ef6ffe425a5fc2325763bfb426599bb0bf4536466efabe709" dependencies = [ "arrow", "arrow-buffer", @@ -1978,16 +1992,16 @@ dependencies = [ "object_store", "parking_lot", "parquet", - "rand 0.9.2", + "rand 0.9.4", "tempfile", "url", ] [[package]] name = "datafusion-expr" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67164333342b86521d6d93fa54081ee39839894fb10f7a700c099af96d7552cf" +checksum = "574b9b6977fedbd2a611cbff12e5caf90f31640ad9dc5870f152836d94bad0dd" dependencies = [ "arrow", "async-trait", @@ -2008,9 +2022,9 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab05fdd00e05d5a6ee362882546d29d6d3df43a6c55355164a7fbee12d163bc9" +checksum = "7d7c3adf3db8bf61e92eb90cb659c8e8b734593a8f7c8e12a843c7ddba24b87e" dependencies = [ "arrow", "datafusion-common", @@ -2021,9 +2035,9 @@ dependencies = [ [[package]] name = "datafusion-functions" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04fb863482d987cf938db2079e07ab0d3bb64595f28907a6c2f8671ad71cca7e" +checksum = "f28aa4e10384e782774b10e72aca4d93ef7b31aa653095d9d4536b0a3dbc51b6" dependencies = [ "arrow", "arrow-buffer", @@ -2044,7 +2058,7 @@ dependencies = [ "md-5", "memchr", "num-traits", - "rand 0.9.2", + "rand 0.9.4", "regex", "sha2", "unicode-segmentation", @@ -2053,9 +2067,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "829856f4e14275fb376c104f27cbf3c3b57a9cfe24885d98677525f5e43ce8d6" +checksum = "00aa6217e56098ba84e0a338176fe52f0a84cca398021512c6c8c5eff806d0ad" dependencies = [ "ahash", "arrow", @@ -2075,9 +2089,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08af79cc3d2aa874a362fb97decfcbd73d687190cb096f16a6c85a7780cce311" +checksum = "b511250349407db7c43832ab2de63f5557b19a20dfd236b39ca2c04468b50d47" dependencies = [ "ahash", "arrow", @@ -2088,9 +2102,9 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "465ae3368146d49c2eda3e2c0ef114424c87e8a6b509ab34c1026ace6497e790" +checksum = "ef13a858e20d50f0a9bb5e96e7ac82b4e7597f247515bccca4fdd2992df0212a" dependencies = [ "arrow", "arrow-ord", @@ -2113,9 +2127,9 @@ dependencies = [ [[package]] name = "datafusion-functions-table" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6156e6b22fcf1784112fc0173f3ae6e78c8fdb4d3ed0eace9543873b437e2af6" +checksum = "72b40d3f5bbb3905f9ccb1ce9485a9595c77b69758a7c24d3ba79e334ff51e7e" dependencies = [ "arrow", "async-trait", @@ -2129,9 +2143,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca7baec14f866729012efb89011a6973f3a346dc8090c567bfcd328deff551c1" +checksum = "d4e88ec9d57c9b685d02f58bfee7be62d72610430ddcedb82a08e5d9925dbfb6" dependencies = [ "arrow", "datafusion-common", @@ 
-2147,9 +2161,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "159228c3280d342658466bb556dc24de30047fe1d7e559dc5d16ccc5324166f9" +checksum = "8307bb93519b1a91913723a1130cfafeee3f72200d870d88e91a6fc5470ede5c" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -2157,9 +2171,9 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5427e5da5edca4d21ea1c7f50e1c9421775fe33d7d5726e5641a833566e7578" +checksum = "2e367e6a71051d0ebdd29b2f85d12059b38b1d1f172c6906e80016da662226bd" dependencies = [ "datafusion-doc", "quote", @@ -2168,9 +2182,9 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89099eefcd5b223ec685c36a41d35c69239236310d71d339f2af0fa4383f3f46" +checksum = "e929015451a67f77d9d8b727b2bf3a40c4445fdef6cdc53281d7d97c76888ace" dependencies = [ "arrow", "chrono", @@ -2188,9 +2202,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f222df5195d605d79098ef37bdd5323bff0131c9d877a24da6ec98dfca9fe36" +checksum = "4b1e68aba7a4b350401cfdf25a3d6f989ad898a7410164afe9ca52080244cb59" dependencies = [ "ahash", "arrow", @@ -2212,9 +2226,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-adapter" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40838625d63d9c12549d81979db3dd675d159055eb9135009ba272ab0e8d0f64" +checksum = "ea22315f33cf2e0adc104e8ec42e285f6ed93998d565c65e82fec6a9ee9f9db4" dependencies = [ "arrow", "datafusion-common", @@ -2227,9 +2241,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eacbcc4cfd502558184ed58fa3c72e775ec65bf077eef5fd2b3453db676f893c" +checksum = "b04b45ea8ad3ac2d78f2ea2a76053e06591c9629c7a603eda16c10649ecf4362" dependencies = [ "ahash", "arrow", @@ -2244,9 +2258,9 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d501d0e1d0910f015677121601ac177ec59272ef5c9324d1147b394988f40941" +checksum = "7cb13397809a425918f608dfe8653f332015a3e330004ab191b4404187238b95" dependencies = [ "arrow", "datafusion-common", @@ -2263,9 +2277,9 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "463c88ad6f1ecab1810f4c9f046898bee035b370137eb79b2b2db925e270631d" +checksum = "5edc023675791af9d5fb4cc4c24abf5f7bd3bd4dcf9e5bd90ea1eff6976dcc79" dependencies = [ "ahash", "arrow", @@ -2295,9 +2309,9 @@ dependencies = [ [[package]] name = "datafusion-pruning" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2857618a0ecbd8cd0cf29826889edd3a25774ec26b2995fc3862095c95d88fc6" +checksum = "ac8c76860e355616555081cab5968cec1af7a80701ff374510860bcd567e365a" dependencies = [ "arrow", "datafusion-common", @@ -2312,9 
+2326,9 @@ dependencies = [ [[package]] name = "datafusion-session" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef8637e35022c5c775003b3ab1debc6b4a8f0eb41b069bdd5475dd3aa93f6eba" +checksum = "5412111aa48e2424ba926112e192f7a6b7e4ccb450145d25ce5ede9f19dc491e" dependencies = [ "async-trait", "datafusion-common", @@ -2344,7 +2358,7 @@ dependencies = [ "datafusion-functions-nested", "log", "percent-encoding", - "rand 0.9.2", + "rand 0.9.4", "serde_json", "sha1", "sha2", @@ -2353,9 +2367,9 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12d9e9f16a1692a11c94bcc418191fa15fd2b4d72a0c1a0c607db93c0b84dd81" +checksum = "fa0d133ddf8b9b3b872acac900157f783e7b879fe9a6bccf389abebbfac45ec1" dependencies = [ "arrow", "bigdecimal", @@ -3056,6 +3070,7 @@ version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" dependencies = [ + "bytemuck", "cfg-if", "crunchy", "num-traits", @@ -3356,11 +3371,12 @@ dependencies = [ "ordered-float 4.6.0", "parquet", "pretty_assertions", - "rand 0.8.5", + "rand 0.9.4", "regex", "reqwest", "roaring", "serde", + "serde_arrow", "serde_bytes", "serde_derive", "serde_json", @@ -3468,14 +3484,19 @@ name = "iceberg-catalog-s3tables" version = "0.9.0" dependencies = [ "anyhow", + "arrow-array", + "arrow-schema", "async-trait", "aws-config", "aws-sdk-s3tables", + "futures", "iceberg", "iceberg-storage-opendal", "iceberg_test_utils", "itertools 0.13.0", + "parquet", "tokio", + "uuid", ] [[package]] @@ -4130,6 +4151,21 @@ dependencies = [ "twox-hash", ] +[[package]] +name = "marrow" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f5240d6977234968ff9ad254bfa73aa397fb51e41dcb22b1eb85835e9295485b" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "bytemuck", + "half", + "serde", +] + [[package]] name = "md-5" version = "0.10.6" @@ -4185,9 +4221,9 @@ dependencies = [ [[package]] name = "minijinja" -version = "2.18.0" +version = "2.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "328251e58ad8e415be6198888fc207502727dc77945806421ab34f35bf012e7d" +checksum = "805bfd7352166bae857ee569628b52bcd85a1cecf7810861ebceb1686b72b75d" dependencies = [ "memo-map", "serde", @@ -4205,9 +4241,9 @@ dependencies = [ [[package]] name = "mio" -version = "1.1.1" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" +checksum = "50b7e5b27aa02a74bac8c3f23f448f8d87ff11f92d3aac1a6ed369ee08cc56c1" dependencies = [ "libc", "wasi", @@ -4257,7 +4293,7 @@ dependencies = [ "hyper-util", "log", "pin-project-lite", - "rand 0.9.2", + "rand 0.9.4", "regex", "serde_json", "serde_urlencoded", @@ -4515,7 +4551,7 @@ dependencies = [ "parking_lot", "percent-encoding", "quick-xml 0.39.2", - "rand 0.10.0", + "rand 0.10.1", "reqwest", "ring", "rustls-pki-types", @@ -5171,7 +5207,7 @@ dependencies = [ "bytes", "getrandom 0.3.4", "lru-slab", - "rand 0.9.2", + "rand 0.9.4", "ring", "rustc-hash", "rustls", @@ -5251,9 +5287,9 @@ dependencies = [ [[package]] name = "rand" -version = "0.9.2" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" dependencies = [ "rand_chacha 0.9.0", "rand_core 0.9.5", @@ -5261,9 +5297,9 @@ dependencies = [ [[package]] name = "rand" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc266eb313df6c5c09c1c7b1fbe2510961e5bcd3add930c1e31f7ed9da0feff8" +checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207" dependencies = [ "chacha20", "getrandom 0.4.2", @@ -5699,9 +5735,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.103.10" +version = "0.103.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef" +checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e" dependencies = [ "aws-lc-rs", "ring", @@ -5893,6 +5929,21 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_arrow" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2784e59a0315568e850cb01ddadf458f8c09e28d8cfc4880c2cc08f5dc3444e0" +dependencies = [ + "arrow-array", + "arrow-schema", + "bytemuck", + "chrono", + "half", + "marrow", + "serde", +] + [[package]] name = "serde_bytes" version = "0.11.19" @@ -6741,9 +6792,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.50.0" +version = "1.51.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27ad5e34374e03cfffefc301becb44e9dc3c17584f414349ebe29ed26661822d" +checksum = "f66bf9585cda4b724d3e78ab34b73fb2bbaba9011b9bfdf69dc836382ea13b8c" dependencies = [ "bytes", "libc", @@ -6758,9 +6809,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.6.1" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c55a2eff8b69ce66c84f85e1da1c233edc36ceb85a2058d11b0d6a3c7e7569c" +checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" dependencies = [ "proc-macro2", "quote", @@ -7247,7 +7298,7 @@ dependencies = [ "nix 0.29.0", "once_cell", "pin-project", - "rand 0.9.2", + "rand 0.9.4", "socket2 0.5.10", "thiserror 2.0.18", "tokio", diff --git a/Cargo.toml b/Cargo.toml index 778e69c9d9..7f612c44bf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -66,7 +66,7 @@ cfg-if = "1" chrono = "0.4.41" clap = { version = "4.5.48", features = ["derive", "cargo"] } dashmap = "6" -datafusion = "53.0.0" +datafusion = "53.1.0" datafusion-cli = "53.0.0" datafusion-sqllogictest = "53.0.0" derive_builder = "0.20" @@ -108,7 +108,7 @@ ordered-float = "4" parquet = "58" pilota = "0.11.10" pretty_assertions = "1.4" -rand = "0.8.5" +rand = "0.9.3" regex = "1.11.3" reqwest = { version = "0.12.12", default-features = false, features = ["json"] } roaring = { version = "0.11" } diff --git a/bindings/python/Cargo.lock b/bindings/python/Cargo.lock index 1b5c06f492..72ea322d7b 100644 --- a/bindings/python/Cargo.lock +++ b/bindings/python/Cargo.lock @@ -163,7 +163,7 @@ dependencies = [ "miniz_oxide", "num-bigint", "quad-rand", - "rand 0.9.2", + "rand 0.9.4", "regex-lite", "serde", "serde_bytes", @@ -1052,9 +1052,9 @@ dependencies = [ [[package]] name = "datafusion" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de9f8117889ba9503440f1dd79ebab32ba52ccf1720bb83cd718a29d4edc0d16" +checksum = 
"93db0e623840612f7f2cd757f7e8a8922064192363732c88692e0870016e141b" dependencies = [ "arrow", "arrow-schema", @@ -1095,7 +1095,7 @@ dependencies = [ "object_store", "parking_lot", "parquet", - "rand 0.9.2", + "rand 0.9.4", "regex", "sqlparser", "tempfile", @@ -1107,9 +1107,9 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be893b73a13671f310ffcc8da2c546b81efcc54c22e0382c0a28aa3537017137" +checksum = "37cefde60b26a7f4ff61e9d2ff2833322f91df2b568d7238afe67bde5bdffb66" dependencies = [ "arrow", "async-trait", @@ -1132,9 +1132,9 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830487b51ed83807d6b32d6325f349c3144ae0c9bf772cf2a712db180c31d5e6" +checksum = "17e112307715d6a7a331111a4c2330ff54bc237183511c319e3708a4cff431fb" dependencies = [ "arrow", "async-trait", @@ -1155,9 +1155,9 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d7663f3af955292f8004e74bcaf8f7ea3d66cc38438749615bb84815b61a293" +checksum = "d72a11ca44a95e1081870d3abb80c717496e8a7acb467a1d3e932bb636af5cc2" dependencies = [ "ahash", "arrow", @@ -1180,9 +1180,9 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f590205c7e32fe1fea48dd53ffb406e56ae0e7a062213a3ac848db8771641bd" +checksum = "89f4afaed29670ec4fd6053643adc749fe3f4bc9d1ce1b8c5679b22c67d12def" dependencies = [ "futures", "log", @@ -1191,9 +1191,9 @@ dependencies = [ [[package]] name = "datafusion-datasource" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fde1e030a9dc87b743c806fbd631f5ecfa2ccaa4ffb61fa19144a07fea406b79" +checksum = "e9fb386e1691355355a96419978a0022b7947b44d4a24a6ea99f00b6b485cbb6" dependencies = [ "arrow", "async-compression", @@ -1217,7 +1217,7 @@ dependencies = [ "liblzma", "log", "object_store", - "rand 0.9.2", + "rand 0.9.4", "tokio", "tokio-util", "url", @@ -1226,9 +1226,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-arrow" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "331ebae7055dc108f9b54994b93dff91f3a17445539efe5b74e89264f7b36e15" +checksum = "ffa6c52cfed0734c5f93754d1c0175f558175248bf686c944fb05c373e5fc096" dependencies = [ "arrow", "arrow-ipc", @@ -1250,9 +1250,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-csv" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e0d475088325e2986876aa27bb30d0574f72a22955a527d202f454681d55c5c" +checksum = "503f29e0582c1fc189578d665ff57d9300da1f80c282777d7eb67bb79fb8cdca" dependencies = [ "arrow", "async-trait", @@ -1273,9 +1273,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-json" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea1520d81f31770f3ad6ee98b391e75e87a68a5bb90de70064ace5e0a7182fe8" +checksum = "e33804749abc8d0c8cb7473228483cb8070e524c6f6086ee1b85a64debe2b3d2" dependencies = [ "arrow", "async-trait", @@ -1297,9 +1297,9 @@ dependencies = [ [[package]] name = 
"datafusion-datasource-parquet" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95be805d0742ab129720f4c51ad9242cd872599cdb076098b03f061fcdc7f946" +checksum = "32a8e0365e0e08e8ff94d912f0ababcf9065a1a304018ba90b1fc83c855b4997" dependencies = [ "arrow", "async-trait", @@ -1327,15 +1327,15 @@ dependencies = [ [[package]] name = "datafusion-doc" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c93ad9e37730d2c7196e68616f3f2dd3b04c892e03acd3a8eeca6e177f3c06a" +checksum = "8de6ac0df1662b9148ad3c987978b32cbec7c772f199b1d53520c8fa764a87ee" [[package]] name = "datafusion-execution" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9437d3cd5d363f9319f8122182d4d233427de79c7eb748f23054c9aaa0fdd8df" +checksum = "c03c7fbdaefcca4ef6ffe425a5fc2325763bfb426599bb0bf4536466efabe709" dependencies = [ "arrow", "arrow-buffer", @@ -1349,16 +1349,16 @@ dependencies = [ "log", "object_store", "parking_lot", - "rand 0.9.2", + "rand 0.9.4", "tempfile", "url", ] [[package]] name = "datafusion-expr" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67164333342b86521d6d93fa54081ee39839894fb10f7a700c099af96d7552cf" +checksum = "574b9b6977fedbd2a611cbff12e5caf90f31640ad9dc5870f152836d94bad0dd" dependencies = [ "arrow", "async-trait", @@ -1379,9 +1379,9 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab05fdd00e05d5a6ee362882546d29d6d3df43a6c55355164a7fbee12d163bc9" +checksum = "7d7c3adf3db8bf61e92eb90cb659c8e8b734593a8f7c8e12a843c7ddba24b87e" dependencies = [ "arrow", "datafusion-common", @@ -1422,9 +1422,9 @@ dependencies = [ [[package]] name = "datafusion-functions" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04fb863482d987cf938db2079e07ab0d3bb64595f28907a6c2f8671ad71cca7e" +checksum = "f28aa4e10384e782774b10e72aca4d93ef7b31aa653095d9d4536b0a3dbc51b6" dependencies = [ "arrow", "arrow-buffer", @@ -1445,7 +1445,7 @@ dependencies = [ "md-5", "memchr", "num-traits", - "rand 0.9.2", + "rand 0.9.4", "regex", "sha2", "unicode-segmentation", @@ -1454,9 +1454,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "829856f4e14275fb376c104f27cbf3c3b57a9cfe24885d98677525f5e43ce8d6" +checksum = "00aa6217e56098ba84e0a338176fe52f0a84cca398021512c6c8c5eff806d0ad" dependencies = [ "ahash", "arrow", @@ -1476,9 +1476,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08af79cc3d2aa874a362fb97decfcbd73d687190cb096f16a6c85a7780cce311" +checksum = "b511250349407db7c43832ab2de63f5557b19a20dfd236b39ca2c04468b50d47" dependencies = [ "ahash", "arrow", @@ -1489,9 +1489,9 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "465ae3368146d49c2eda3e2c0ef114424c87e8a6b509ab34c1026ace6497e790" +checksum = "ef13a858e20d50f0a9bb5e96e7ac82b4e7597f247515bccca4fdd2992df0212a" 
dependencies = [ "arrow", "arrow-ord", @@ -1514,9 +1514,9 @@ dependencies = [ [[package]] name = "datafusion-functions-table" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6156e6b22fcf1784112fc0173f3ae6e78c8fdb4d3ed0eace9543873b437e2af6" +checksum = "72b40d3f5bbb3905f9ccb1ce9485a9595c77b69758a7c24d3ba79e334ff51e7e" dependencies = [ "arrow", "async-trait", @@ -1530,9 +1530,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca7baec14f866729012efb89011a6973f3a346dc8090c567bfcd328deff551c1" +checksum = "d4e88ec9d57c9b685d02f58bfee7be62d72610430ddcedb82a08e5d9925dbfb6" dependencies = [ "arrow", "datafusion-common", @@ -1548,9 +1548,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "159228c3280d342658466bb556dc24de30047fe1d7e559dc5d16ccc5324166f9" +checksum = "8307bb93519b1a91913723a1130cfafeee3f72200d870d88e91a6fc5470ede5c" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -1558,9 +1558,9 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5427e5da5edca4d21ea1c7f50e1c9421775fe33d7d5726e5641a833566e7578" +checksum = "2e367e6a71051d0ebdd29b2f85d12059b38b1d1f172c6906e80016da662226bd" dependencies = [ "datafusion-doc", "quote", @@ -1569,9 +1569,9 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89099eefcd5b223ec685c36a41d35c69239236310d71d339f2af0fa4383f3f46" +checksum = "e929015451a67f77d9d8b727b2bf3a40c4445fdef6cdc53281d7d97c76888ace" dependencies = [ "arrow", "chrono", @@ -1589,9 +1589,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f222df5195d605d79098ef37bdd5323bff0131c9d877a24da6ec98dfca9fe36" +checksum = "4b1e68aba7a4b350401cfdf25a3d6f989ad898a7410164afe9ca52080244cb59" dependencies = [ "ahash", "arrow", @@ -1613,9 +1613,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-adapter" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40838625d63d9c12549d81979db3dd675d159055eb9135009ba272ab0e8d0f64" +checksum = "ea22315f33cf2e0adc104e8ec42e285f6ed93998d565c65e82fec6a9ee9f9db4" dependencies = [ "arrow", "datafusion-common", @@ -1628,9 +1628,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eacbcc4cfd502558184ed58fa3c72e775ec65bf077eef5fd2b3453db676f893c" +checksum = "b04b45ea8ad3ac2d78f2ea2a76053e06591c9629c7a603eda16c10649ecf4362" dependencies = [ "ahash", "arrow", @@ -1645,9 +1645,9 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d501d0e1d0910f015677121601ac177ec59272ef5c9324d1147b394988f40941" +checksum = "7cb13397809a425918f608dfe8653f332015a3e330004ab191b4404187238b95" 
dependencies = [ "arrow", "datafusion-common", @@ -1664,9 +1664,9 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "463c88ad6f1ecab1810f4c9f046898bee035b370137eb79b2b2db925e270631d" +checksum = "5edc023675791af9d5fb4cc4c24abf5f7bd3bd4dcf9e5bd90ea1eff6976dcc79" dependencies = [ "ahash", "arrow", @@ -1719,7 +1719,7 @@ dependencies = [ "datafusion-proto-common", "object_store", "prost", - "rand 0.9.2", + "rand 0.9.4", ] [[package]] @@ -1735,9 +1735,9 @@ dependencies = [ [[package]] name = "datafusion-pruning" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2857618a0ecbd8cd0cf29826889edd3a25774ec26b2995fc3862095c95d88fc6" +checksum = "ac8c76860e355616555081cab5968cec1af7a80701ff374510860bcd567e365a" dependencies = [ "arrow", "datafusion-common", @@ -1752,9 +1752,9 @@ dependencies = [ [[package]] name = "datafusion-session" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef8637e35022c5c775003b3ab1debc6b4a8f0eb41b069bdd5475dd3aa93f6eba" +checksum = "5412111aa48e2424ba926112e192f7a6b7e4ccb450145d25ce5ede9f19dc491e" dependencies = [ "async-trait", "datafusion-common", @@ -1766,9 +1766,9 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12d9e9f16a1692a11c94bcc418191fa15fd2b4d72a0c1a0c607db93c0b84dd81" +checksum = "fa0d133ddf8b9b3b872acac900157f783e7b879fe9a6bccf389abebbfac45ec1" dependencies = [ "arrow", "bigdecimal", @@ -2435,7 +2435,7 @@ dependencies = [ "once_cell", "ordered-float 4.6.0", "parquet", - "rand 0.8.5", + "rand 0.9.4", "reqwest", "roaring", "serde", @@ -3587,7 +3587,7 @@ dependencies = [ "bytes", "getrandom 0.3.4", "lru-slab", - "rand 0.9.2", + "rand 0.9.4", "ring", "rustc-hash", "rustls", @@ -3647,9 +3647,9 @@ dependencies = [ [[package]] name = "rand" -version = "0.9.2" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" dependencies = [ "rand_chacha 0.9.0", "rand_core 0.9.5", @@ -3968,9 +3968,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.103.10" +version = "0.103.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef" +checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e" dependencies = [ "ring", "rustls-pki-types", diff --git a/crates/catalog/glue/src/catalog.rs b/crates/catalog/glue/src/catalog.rs index a7e0171337..5b3ccf3b39 100644 --- a/crates/catalog/glue/src/catalog.rs +++ b/crates/catalog/glue/src/catalog.rs @@ -203,7 +203,6 @@ impl GlueCatalog { // Use provided factory or default to OpenDalStorageFactory::S3 let factory = storage_factory.unwrap_or_else(|| { Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3a".to_string(), customized_credential_load: None, }) }); diff --git a/crates/catalog/hms/tests/hms_catalog_test.rs b/crates/catalog/hms/tests/hms_catalog_test.rs index f19cf7bff4..d0e6486ad8 100644 --- a/crates/catalog/hms/tests/hms_catalog_test.rs +++ b/crates/catalog/hms/tests/hms_catalog_test.rs @@ -23,7 +23,10 @@ use 
std::collections::HashMap; use std::sync::Arc; -use iceberg::io::{FileIOBuilder, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_REGION, S3_SECRET_ACCESS_KEY}; +use iceberg::io::{ + FileIOBuilder, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_PATH_STYLE_ACCESS, S3_REGION, + S3_SECRET_ACCESS_KEY, +}; use iceberg::{Catalog, CatalogBuilder, Namespace, NamespaceIdent}; use iceberg_catalog_hms::{ HMS_CATALOG_PROP_THRIFT_TRANSPORT, HMS_CATALOG_PROP_URI, HMS_CATALOG_PROP_WAREHOUSE, @@ -56,11 +59,11 @@ async fn get_catalog() -> HmsCatalog { (S3_ACCESS_KEY_ID.to_string(), "admin".to_string()), (S3_SECRET_ACCESS_KEY.to_string(), "password".to_string()), (S3_REGION.to_string(), "us-east-1".to_string()), + (S3_PATH_STYLE_ACCESS.to_string(), "true".to_string()), ]); // Wait for bucket to actually exist let file_io = FileIOBuilder::new(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3a".to_string(), customized_credential_load: None, })) .with_props(props.clone()) @@ -79,7 +82,6 @@ async fn get_catalog() -> HmsCatalog { HmsCatalogBuilder::default() .with_storage_factory(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3a".to_string(), customized_credential_load: None, })) .load("hms", props) diff --git a/crates/catalog/loader/tests/common/mod.rs b/crates/catalog/loader/tests/common/mod.rs index 600cd9b6f4..1d40fef357 100644 --- a/crates/catalog/loader/tests/common/mod.rs +++ b/crates/catalog/loader/tests/common/mod.rs @@ -24,8 +24,8 @@ use std::fmt; use std::sync::Arc; use iceberg::io::{ - FileIOBuilder, LocalFsStorageFactory, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_REGION, - S3_SECRET_ACCESS_KEY, + FileIOBuilder, LocalFsStorageFactory, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_PATH_STYLE_ACCESS, + S3_REGION, S3_SECRET_ACCESS_KEY, }; use iceberg::memory::{MEMORY_CATALOG_WAREHOUSE, MemoryCatalogBuilder}; use iceberg::spec::{NestedField, PrimitiveType, Schema, Type}; @@ -229,10 +229,10 @@ async fn glue_catalog() -> GlueCatalog { (S3_ACCESS_KEY_ID.to_string(), "admin".to_string()), (S3_SECRET_ACCESS_KEY.to_string(), "password".to_string()), (S3_REGION.to_string(), "us-east-1".to_string()), + (S3_PATH_STYLE_ACCESS.to_string(), "true".to_string()), ]); let file_io = FileIOBuilder::new(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3a".to_string(), customized_credential_load: None, })) .with_props(props.clone()) @@ -280,10 +280,10 @@ async fn hms_catalog() -> HmsCatalog { (S3_ACCESS_KEY_ID.to_string(), "admin".to_string()), (S3_SECRET_ACCESS_KEY.to_string(), "password".to_string()), (S3_REGION.to_string(), "us-east-1".to_string()), + (S3_PATH_STYLE_ACCESS.to_string(), "true".to_string()), ]); let file_io = FileIOBuilder::new(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3a".to_string(), customized_credential_load: None, })) .with_props(props.clone()) @@ -300,7 +300,6 @@ async fn hms_catalog() -> HmsCatalog { HmsCatalogBuilder::default() .with_storage_factory(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3a".to_string(), customized_credential_load: None, })) .load("hms", props) diff --git a/crates/catalog/s3tables/Cargo.toml b/crates/catalog/s3tables/Cargo.toml index 2fe096fec9..dc7be3027f 100644 --- a/crates/catalog/s3tables/Cargo.toml +++ b/crates/catalog/s3tables/Cargo.toml @@ -39,6 +39,11 @@ iceberg-storage-opendal = { workspace = true, features = ["opendal-s3"] } [dev-dependencies] +arrow-array = { workspace = true } +arrow-schema = { workspace = true } +futures = { workspace = true } iceberg_test_utils = { path = "../../test_utils", features = ["tests"] } itertools = { workspace = 
true }
+parquet = { workspace = true }
 tokio = { workspace = true }
+uuid = { workspace = true }
diff --git a/crates/catalog/s3tables/src/catalog.rs b/crates/catalog/s3tables/src/catalog.rs
index b88bd77d29..cc43446943 100644
--- a/crates/catalog/s3tables/src/catalog.rs
+++ b/crates/catalog/s3tables/src/catalog.rs
@@ -202,7 +202,6 @@ impl S3TablesCatalog {
         // Use provided factory or default to OpenDalStorageFactory::S3
         let factory = storage_factory.unwrap_or_else(|| {
             Arc::new(OpenDalStorageFactory::S3 {
-                configured_scheme: "s3a".to_string(),
                 customized_credential_load: None,
             })
         });
@@ -707,6 +706,7 @@ where T: std::fmt::Debug {
 #[cfg(test)]
 mod tests {
+    use futures::TryStreamExt;
     use iceberg::spec::{NestedField, PrimitiveType, Schema, Type};
     use iceberg::transaction::{ApplyTransactionAction, Transaction};
@@ -1175,4 +1175,108 @@ mod tests {
             assert_eq!(err.message(), "Catalog name cannot be empty");
         }
     }
+
+    /// Verify that an S3 Table catalog can create a table, write data, load the same table, and read from it.
+    #[tokio::test]
+    async fn test_s3tables_create_table_write_load_table_read() {
+        use iceberg::writer::base_writer::data_file_writer::DataFileWriterBuilder;
+        use iceberg::writer::file_writer::ParquetWriterBuilder;
+        use iceberg::writer::file_writer::location_generator::{
+            DefaultFileNameGenerator, DefaultLocationGenerator,
+        };
+        use iceberg::writer::file_writer::rolling_writer::RollingFileWriterBuilder;
+        use iceberg::writer::{IcebergWriter, IcebergWriterBuilder};
+
+        let catalog = match load_s3tables_catalog_from_env().await {
+            Ok(Some(c)) => c,
+            Ok(None) => return,
+            Err(e) => panic!("Error loading catalog: {e}"),
+        };
+
+        let ns = NamespaceIdent::new(format!("test_rw_{}", uuid::Uuid::new_v4().simple()));
+        catalog.create_namespace(&ns, HashMap::new()).await.unwrap();
+
+        let table_name = String::from("table");
+
+        let schema = Schema::builder()
+            .with_fields(vec![
+                NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(),
+            ])
+            .build()
+            .unwrap();
+        let creation = TableCreation::builder()
+            .name(table_name.clone())
+            .schema(schema)
+            .build();
+
+        let table = catalog.create_table(&ns, creation).await.unwrap();
+
+        // Write one row.
+        let arrow_schema: Arc<arrow_schema::Schema> = Arc::new(
+            table
+                .metadata()
+                .current_schema()
+                .as_ref()
+                .try_into()
+                .unwrap(),
+        );
+        let batch = arrow_array::RecordBatch::try_new(arrow_schema, vec![Arc::new(
+            arrow_array::Int32Array::from(vec![42]),
+        )])
+        .unwrap();
+
+        // Locations will be generated based on the table metadata, which will be using `s3://` for Amazon S3 Tables.
+        let location_generator = DefaultLocationGenerator::new(table.metadata().clone()).unwrap();
+        let file_name_generator = DefaultFileNameGenerator::new(
+            "test".to_string(),
+            None,
+            iceberg::spec::DataFileFormat::Parquet,
+        );
+        let parquet_writer_builder = ParquetWriterBuilder::new(
+            parquet::file::properties::WriterProperties::default(),
+            table.metadata().current_schema().clone(),
+        );
+        let rw = RollingFileWriterBuilder::new_with_default_file_size(
+            parquet_writer_builder,
+            table.file_io().clone(),
+            location_generator,
+            file_name_generator,
+        );
+        let mut writer = DataFileWriterBuilder::new(rw).build(None).await.unwrap();
+        writer.write(batch.clone()).await.unwrap();
+        let data_files = writer.close().await.unwrap();
+
+        let tx = Transaction::new(&table);
+        let tx = tx
+            .fast_append()
+            .add_data_files(data_files)
+            .apply(tx)
+            .unwrap();
+        tx.commit(&catalog).await.unwrap();
+
+        // Reload from catalog and read back.
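+        // load_table returns a fresh Table instance from the catalog, so the scan
+        // below exercises the full metadata -> manifest -> data file read path.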
+        let table_ident = TableIdent::new(ns.clone(), table_name.clone());
+        let reloaded = catalog.load_table(&table_ident).await.unwrap();
+        let batches: Vec<arrow_array::RecordBatch> = reloaded
+            .scan()
+            .select_all()
+            .build()
+            .expect("scan to be valid (snapshot exists, schema is OK)")
+            .to_arrow()
+            .await
+            .expect("scan tasks should be OK")
+            .try_collect()
+            .await
+            .expect("scan should complete successfully");
+
+        assert_eq!(batches.len(), 1);
+        assert_eq!(
+            batches[0], batch,
+            "read records should match records written earlier"
+        );
+
+        // Clean up.
+        catalog.purge_table(&table_ident).await.ok();
+        catalog.drop_namespace(&ns).await.ok();
+    }
 }
diff --git a/crates/iceberg/Cargo.toml b/crates/iceberg/Cargo.toml
index aa1d0cd4a5..18729176dc 100644
--- a/crates/iceberg/Cargo.toml
+++ b/crates/iceberg/Cargo.toml
@@ -91,6 +91,7 @@ rand = { workspace = true }
 regex = { workspace = true }
 tempfile = { workspace = true }
 minijinja = { workspace = true }
+serde_arrow = { version = "0.14", features = ["arrow-58"] }
 
 [package.metadata.cargo-machete]
 # These dependencies are added to ensure minimal dependency version
diff --git a/crates/iceberg/src/arrow/int96.rs b/crates/iceberg/src/arrow/int96.rs
new file mode 100644
index 0000000000..63a7a30f1a
--- /dev/null
+++ b/crates/iceberg/src/arrow/int96.rs
@@ -0,0 +1,578 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! INT96 timestamp coercion for Parquet files.
+
+use std::sync::Arc;
+
+use arrow_schema::{
+    DataType, Field, Fields, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, TimeUnit,
+};
+use parquet::arrow::PARQUET_FIELD_ID_META_KEY;
+
+use crate::arrow::schema::{ArrowSchemaVisitor, DEFAULT_MAP_FIELD_NAME, visit_schema};
+use crate::error::Result;
+use crate::spec::{PrimitiveType, Schema, Type};
+use crate::{Error, ErrorKind};
+
+/// Coerce Arrow schema types for INT96 columns to match the Iceberg table schema.
+///
+/// arrow-rs defaults INT96 to `Timestamp(Nanosecond)`, which overflows i64 for dates outside
+/// ~1677-2262. We use arrow-rs's schema hint mechanism to read INT96 at the resolution
+/// specified by the Iceberg schema (`timestamp` → microsecond, `timestamp_ns` → nanosecond).
+///
+/// Iceberg Java handles this differently: it bypasses parquet-mr with a custom column reader
+/// (`GenericParquetReaders.TimestampInt96Reader`). We achieve the same result via schema hints.
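+///
+/// For scale: an i64 of nanoseconds since the Unix epoch can only represent
+/// instants between 1677-09-21 and 2262-04-11, while an i64 of microseconds
+/// covers roughly ±292,000 years, which is why microseconds are the safe default.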
+///
+/// References:
+/// - Iceberg spec primitive types: <https://iceberg.apache.org/spec/#primitive-types>
+/// - arrow-rs schema hint support: <https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_schema>
+pub(crate) fn coerce_int96_timestamps(
+    arrow_schema: &ArrowSchemaRef,
+    iceberg_schema: &Schema,
+) -> Option<ArrowSchemaRef> {
+    let mut visitor = Int96CoercionVisitor::new(iceberg_schema);
+    let coerced = visit_schema(arrow_schema, &mut visitor).ok()?;
+    if visitor.changed {
+        Some(Arc::new(coerced))
+    } else {
+        None
+    }
+}
+
+/// Visitor that coerces `Timestamp(Nanosecond)` Arrow fields to the resolution
+/// indicated by the Iceberg schema.
+struct Int96CoercionVisitor<'a> {
+    iceberg_schema: &'a Schema,
+    // TODO(#2310): use FieldRef (Arc<Field>) once ArrowSchemaVisitor passes FieldRef.
+    field_stack: Vec<Field>,
+    changed: bool,
+}
+
+impl<'a> Int96CoercionVisitor<'a> {
+    fn new(iceberg_schema: &'a Schema) -> Self {
+        Self {
+            iceberg_schema,
+            field_stack: Vec::new(),
+            changed: false,
+        }
+    }
+
+    /// Determine the target TimeUnit for a Timestamp(Nanosecond) field based on the
+    /// Iceberg schema. Falls back to microsecond when field IDs are unavailable,
+    /// matching Iceberg Java behavior.
+    fn target_unit(&self, field: &Field) -> Option<TimeUnit> {
+        if !matches!(
+            field.data_type(),
+            DataType::Timestamp(TimeUnit::Nanosecond, _)
+        ) {
+            return None;
+        }
+
+        let target = field
+            .metadata()
+            .get(PARQUET_FIELD_ID_META_KEY)
+            .and_then(|id_str| id_str.parse::<i32>().ok())
+            .and_then(|field_id| self.iceberg_schema.field_by_id(field_id))
+            .and_then(|f| match &*f.field_type {
+                Type::Primitive(PrimitiveType::Timestamp | PrimitiveType::Timestamptz) => {
+                    Some(TimeUnit::Microsecond)
+                }
+                Type::Primitive(PrimitiveType::TimestampNs | PrimitiveType::TimestamptzNs) => {
+                    Some(TimeUnit::Nanosecond)
+                }
+                _ => None,
+            })
+            // Iceberg Java reads INT96 as microseconds by default
+            .unwrap_or(TimeUnit::Microsecond);
+
+        if target == TimeUnit::Nanosecond {
+            None
+        } else {
+            Some(target)
+        }
+    }
+}
+
+impl ArrowSchemaVisitor for Int96CoercionVisitor<'_> {
+    type T = Field;
+    type U = ArrowSchema;
+
+    fn before_field(&mut self, field: &Field) -> Result<()> {
+        self.field_stack.push(field.as_ref().clone());
+        Ok(())
+    }
+
+    fn after_field(&mut self, _field: &Field) -> Result<()> {
+        self.field_stack.pop();
+        Ok(())
+    }
+
+    fn before_list_element(&mut self, field: &Field) -> Result<()> {
+        self.field_stack.push(field.as_ref().clone());
+        Ok(())
+    }
+
+    fn after_list_element(&mut self, _field: &Field) -> Result<()> {
+        self.field_stack.pop();
+        Ok(())
+    }
+
+    fn before_map_key(&mut self, field: &Field) -> Result<()> {
+        self.field_stack.push(field.as_ref().clone());
+        Ok(())
+    }
+
+    fn after_map_key(&mut self, _field: &Field) -> Result<()> {
+        self.field_stack.pop();
+        Ok(())
+    }
+
+    fn before_map_value(&mut self, field: &Field) -> Result<()> {
+        self.field_stack.push(field.as_ref().clone());
+        Ok(())
+    }
+
+    fn after_map_value(&mut self, _field: &Field) -> Result<()> {
+        self.field_stack.pop();
+        Ok(())
+    }
+
+    fn schema(&mut self, schema: &ArrowSchema, values: Vec<Field>) -> Result<ArrowSchema> {
+        Ok(ArrowSchema::new_with_metadata(
+            values,
+            schema.metadata().clone(),
+        ))
+    }
+
+    fn r#struct(&mut self, _fields: &Fields, results: Vec<Field>) -> Result<Field> {
+        let field_info = self
+            .field_stack
+            .last()
+            .ok_or_else(|| Error::new(ErrorKind::Unexpected, "Field stack underflow in struct"))?;
+        Ok(Field::new(
+            field_info.name(),
+            DataType::Struct(Fields::from(results)),
+            field_info.is_nullable(),
+        )
+        .with_metadata(field_info.metadata().clone()))
+    }
+
+    fn list(&mut self, list: &DataType, value: Field) -> Result<Field> {
+        let field_info = self
+            .field_stack
+            .last()
+            .ok_or_else(|| Error::new(ErrorKind::Unexpected, "Field stack underflow in list"))?;
+        let list_type = match list {
+            DataType::List(_) => DataType::List(Arc::new(value)),
+            DataType::LargeList(_) => DataType::LargeList(Arc::new(value)),
+            DataType::FixedSizeList(_, size) => DataType::FixedSizeList(Arc::new(value), *size),
+            _ => {
+                return Err(Error::new(
+                    ErrorKind::Unexpected,
+                    format!("Expected list type, got {list}"),
+                ));
+            }
+        };
+        Ok(
+            Field::new(field_info.name(), list_type, field_info.is_nullable())
+                .with_metadata(field_info.metadata().clone()),
+        )
+    }
+
+    fn map(&mut self, map: &DataType, key_value: Field, value: Field) -> Result<Field> {
+        let field_info = self
+            .field_stack
+            .last()
+            .ok_or_else(|| Error::new(ErrorKind::Unexpected, "Field stack underflow in map"))?;
+        let sorted = match map {
+            DataType::Map(_, sorted) => *sorted,
+            _ => {
+                return Err(Error::new(
+                    ErrorKind::Unexpected,
+                    format!("Expected map type, got {map}"),
+                ));
+            }
+        };
+        let struct_field = Field::new(
+            DEFAULT_MAP_FIELD_NAME,
+            DataType::Struct(Fields::from(vec![key_value, value])),
+            false,
+        );
+        Ok(Field::new(
+            field_info.name(),
+            DataType::Map(Arc::new(struct_field), sorted),
+            field_info.is_nullable(),
+        )
+        .with_metadata(field_info.metadata().clone()))
+    }
+
+    fn primitive(&mut self, p: &DataType) -> Result<Field> {
+        let field_info = self.field_stack.last().ok_or_else(|| {
+            Error::new(ErrorKind::Unexpected, "Field stack underflow in primitive")
+        })?;
+
+        if let Some(target_unit) = self.target_unit(field_info) {
+            let tz = match field_info.data_type() {
+                DataType::Timestamp(_, tz) => tz.clone(),
+                _ => None,
+            };
+            self.changed = true;
+            Ok(Field::new(
+                field_info.name(),
+                DataType::Timestamp(target_unit, tz),
+                field_info.is_nullable(),
+            )
+            .with_metadata(field_info.metadata().clone()))
+        } else {
+            Ok(
+                Field::new(field_info.name(), p.clone(), field_info.is_nullable())
+                    .with_metadata(field_info.metadata().clone()),
+            )
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::collections::HashMap;
+    use std::sync::Arc;
+
+    use arrow_schema::{DataType, Field, Schema as ArrowSchema, TimeUnit};
+    use parquet::arrow::PARQUET_FIELD_ID_META_KEY;
+
+    use super::coerce_int96_timestamps;
+    use crate::spec::{ListType, MapType, NestedField, PrimitiveType, Schema, StructType, Type};
+
+    fn iceberg_schema_with_timestamp() -> Schema {
+        Schema::builder()
+            .with_schema_id(1)
+            .with_fields(vec![
+                NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::Timestamp)).into(),
+                NestedField::required(2, "id", Type::Primitive(PrimitiveType::Int)).into(),
+            ])
+            .build()
+            .unwrap()
+    }
+
+    fn field_id_meta(id: i32) -> HashMap<String, String> {
+        HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(), id.to_string())])
+    }
+
+    #[test]
+    fn test_coerce_timestamp_ns_to_us() {
+        let arrow_schema = Arc::new(ArrowSchema::new(vec![
+            Field::new("ts", DataType::Timestamp(TimeUnit::Nanosecond, None), true)
+                .with_metadata(field_id_meta(1)),
+            Field::new("id", DataType::Int32, false).with_metadata(field_id_meta(2)),
+        ]));
+        let iceberg = iceberg_schema_with_timestamp();
+
+        let coerced = coerce_int96_timestamps(&arrow_schema, &iceberg).unwrap();
+        assert_eq!(
+            coerced.field(0).data_type(),
+            &DataType::Timestamp(TimeUnit::Microsecond, None)
+        );
+        // Non-timestamp field unchanged
+        assert_eq!(coerced.field(1).data_type(), &DataType::Int32);
+    }
+
+    #[test]
+    fn test_coerce_timestamptz_ns_to_us() {
+        let iceberg = Schema::builder()
+            .with_schema_id(1)
+            .with_fields(vec![
+                NestedField::optional(1, "ts", 
Type::Primitive(PrimitiveType::Timestamptz)).into(), + ]) + .build() + .unwrap(); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new( + "ts", + DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into())), + true, + ) + .with_metadata(field_id_meta(1)), + ])); + + let coerced = coerce_int96_timestamps(&arrow_schema, &iceberg).unwrap(); + assert_eq!( + coerced.field(0).data_type(), + &DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into())) + ); + } + + #[test] + fn test_no_coercion_when_iceberg_is_timestamp_ns() { + let iceberg = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::TimestampNs)).into(), + ]) + .build() + .unwrap(); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("ts", DataType::Timestamp(TimeUnit::Nanosecond, None), true) + .with_metadata(field_id_meta(1)), + ])); + + assert!(coerce_int96_timestamps(&arrow_schema, &iceberg).is_none()); + } + + #[test] + fn test_no_coercion_when_iceberg_is_timestamptz_ns() { + let iceberg = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::TimestamptzNs)) + .into(), + ]) + .build() + .unwrap(); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new( + "ts", + DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into())), + true, + ) + .with_metadata(field_id_meta(1)), + ])); + + assert!(coerce_int96_timestamps(&arrow_schema, &iceberg).is_none()); + } + + #[test] + fn test_no_coercion_when_already_microsecond() { + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("ts", DataType::Timestamp(TimeUnit::Microsecond, None), true) + .with_metadata(field_id_meta(1)), + Field::new("id", DataType::Int32, false).with_metadata(field_id_meta(2)), + ])); + let iceberg = iceberg_schema_with_timestamp(); + + assert!(coerce_int96_timestamps(&arrow_schema, &iceberg).is_none()); + } + + // Without field IDs, the visitor can't look up the Iceberg type and falls back + // to microsecond to match Iceberg Java behavior. + #[test] + fn test_defaults_to_us_without_field_ids() { + let arrow_schema = Arc::new(ArrowSchema::new(vec![Field::new( + "ts", + DataType::Timestamp(TimeUnit::Nanosecond, None), + true, + )])); + let iceberg = iceberg_schema_with_timestamp(); + + let coerced = coerce_int96_timestamps(&arrow_schema, &iceberg).unwrap(); + assert_eq!( + coerced.field(0).data_type(), + &DataType::Timestamp(TimeUnit::Microsecond, None) + ); + } + + // Field ID exists but points to a non-timestamp Iceberg type. The field_by_id + // lookup succeeds but the match arm returns None, so unwrap_or falls back to + // microsecond. 
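+    // For example, an Arrow `Timestamp(Nanosecond)` column whose field ID resolves to
+    // an Iceberg `string` column is still rewritten to microseconds rather than left as-is.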
+ #[test] + fn test_defaults_to_us_when_iceberg_type_is_not_timestamp() { + let iceberg = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::String)).into(), + ]) + .build() + .unwrap(); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("ts", DataType::Timestamp(TimeUnit::Nanosecond, None), true) + .with_metadata(field_id_meta(1)), + ])); + + let coerced = coerce_int96_timestamps(&arrow_schema, &iceberg).unwrap(); + assert_eq!( + coerced.field(0).data_type(), + &DataType::Timestamp(TimeUnit::Microsecond, None) + ); + } + + #[test] + fn test_coerce_preserves_field_metadata() { + let mut meta = field_id_meta(1); + meta.insert("custom_key".to_string(), "custom_value".to_string()); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("ts", DataType::Timestamp(TimeUnit::Nanosecond, None), true) + .with_metadata(meta.clone()), + ])); + let iceberg = iceberg_schema_with_timestamp(); + + let coerced = coerce_int96_timestamps(&arrow_schema, &iceberg).unwrap(); + assert_eq!(coerced.field(0).metadata(), &meta); + } + + #[test] + fn test_coerce_timestamp_in_struct() { + let iceberg = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required( + 1, + "data", + Type::Struct(StructType::new(vec![ + NestedField::optional(2, "ts", Type::Primitive(PrimitiveType::Timestamp)) + .into(), + ])), + ) + .into(), + ]) + .build() + .unwrap(); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new( + "data", + DataType::Struct( + vec![ + Field::new("ts", DataType::Timestamp(TimeUnit::Nanosecond, None), true) + .with_metadata(field_id_meta(2)), + ] + .into(), + ), + false, + ) + .with_metadata(field_id_meta(1)), + ])); + + let coerced = coerce_int96_timestamps(&arrow_schema, &iceberg).unwrap(); + let inner = match coerced.field(0).data_type() { + DataType::Struct(fields) => fields, + other => panic!("Expected Struct, got {other}"), + }; + assert_eq!( + inner[0].data_type(), + &DataType::Timestamp(TimeUnit::Microsecond, None) + ); + } + + #[test] + fn test_coerce_timestamp_in_list() { + let iceberg = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional( + 1, + "timestamps", + Type::List(ListType { + element_field: NestedField::optional( + 2, + "element", + Type::Primitive(PrimitiveType::Timestamp), + ) + .into(), + }), + ) + .into(), + ]) + .build() + .unwrap(); + + let element_field = Field::new( + "element", + DataType::Timestamp(TimeUnit::Nanosecond, None), + true, + ) + .with_metadata(field_id_meta(2)); + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("timestamps", DataType::List(Arc::new(element_field)), true) + .with_metadata(field_id_meta(1)), + ])); + + let coerced = coerce_int96_timestamps(&arrow_schema, &iceberg).unwrap(); + let element_dt = match coerced.field(0).data_type() { + DataType::List(f) => f.data_type(), + other => panic!("Expected List, got {other}"), + }; + assert_eq!( + element_dt, + &DataType::Timestamp(TimeUnit::Microsecond, None) + ); + } + + #[test] + fn test_coerce_timestamp_in_map_value() { + let iceberg = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional( + 1, + "ts_map", + Type::Map(MapType { + key_field: NestedField::required( + 2, + "key", + Type::Primitive(PrimitiveType::String), + ) + .into(), + value_field: NestedField::optional( + 3, + "value", + Type::Primitive(PrimitiveType::Timestamp), + ) + .into(), + }), + ) + .into(), + ]) + .build() + .unwrap(); + + 
let key_field = Field::new("key", DataType::Utf8, false).with_metadata(field_id_meta(2)); + let value_field = Field::new( + "value", + DataType::Timestamp(TimeUnit::Nanosecond, None), + true, + ) + .with_metadata(field_id_meta(3)); + let entries_field = Field::new( + "key_value", + DataType::Struct(vec![key_field, value_field].into()), + false, + ); + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new( + "ts_map", + DataType::Map(Arc::new(entries_field), false), + true, + ) + .with_metadata(field_id_meta(1)), + ])); + + let coerced = coerce_int96_timestamps(&arrow_schema, &iceberg).unwrap(); + let value_dt = match coerced.field(0).data_type() { + DataType::Map(entries, _) => match entries.data_type() { + DataType::Struct(fields) => fields[1].data_type().clone(), + other => panic!("Expected Struct inside Map, got {other}"), + }, + other => panic!("Expected Map, got {other}"), + }; + assert_eq!(value_dt, DataType::Timestamp(TimeUnit::Microsecond, None)); + } +} diff --git a/crates/iceberg/src/arrow/mod.rs b/crates/iceberg/src/arrow/mod.rs index c091c45177..7823320452 100644 --- a/crates/iceberg/src/arrow/mod.rs +++ b/crates/iceberg/src/arrow/mod.rs @@ -27,6 +27,7 @@ pub(crate) mod caching_delete_file_loader; pub mod delete_file_loader; pub(crate) mod delete_filter; +mod int96; mod reader; /// RecordBatch projection utilities pub mod record_batch_projector; diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs deleted file mode 100644 index 042a730e19..0000000000 --- a/crates/iceberg/src/arrow/reader.rs +++ /dev/null @@ -1,4670 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! 
Parquet file data reader
-
-use std::collections::{HashMap, HashSet};
-use std::ops::Range;
-use std::str::FromStr;
-use std::sync::Arc;
-
-use arrow_arith::boolean::{and, and_kleene, is_not_null, is_null, not, or, or_kleene};
-use arrow_array::{Array, ArrayRef, BooleanArray, Datum as ArrowDatum, RecordBatch, Scalar};
-use arrow_cast::cast::cast;
-use arrow_ord::cmp::{eq, gt, gt_eq, lt, lt_eq, neq};
-use arrow_schema::{
-    ArrowError, DataType, FieldRef, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef,
-};
-use arrow_string::like::starts_with;
-use bytes::Bytes;
-use fnv::FnvHashSet;
-use futures::future::BoxFuture;
-use futures::{FutureExt, StreamExt, TryFutureExt, TryStreamExt};
-use parquet::arrow::arrow_reader::{
-    ArrowPredicateFn, ArrowReaderMetadata, ArrowReaderOptions, RowFilter, RowSelection, RowSelector,
-};
-use parquet::arrow::async_reader::AsyncFileReader;
-use parquet::arrow::{PARQUET_FIELD_ID_META_KEY, ParquetRecordBatchStreamBuilder, ProjectionMask};
-use parquet::file::metadata::{
-    PageIndexPolicy, ParquetMetaData, ParquetMetaDataReader, RowGroupMetaData,
-};
-use parquet::schema::types::{SchemaDescriptor, Type as ParquetType};
-use typed_builder::TypedBuilder;
-
-use crate::arrow::caching_delete_file_loader::CachingDeleteFileLoader;
-use crate::arrow::record_batch_transformer::RecordBatchTransformerBuilder;
-use crate::arrow::{arrow_schema_to_schema, get_arrow_datum};
-use crate::delete_vector::DeleteVector;
-use crate::error::Result;
-use crate::expr::visitors::bound_predicate_visitor::{BoundPredicateVisitor, visit};
-use crate::expr::visitors::page_index_evaluator::PageIndexEvaluator;
-use crate::expr::visitors::row_group_metrics_evaluator::RowGroupMetricsEvaluator;
-use crate::expr::{BoundPredicate, BoundReference};
-use crate::io::{FileIO, FileMetadata, FileRead};
-use crate::metadata_columns::{RESERVED_FIELD_ID_FILE, is_metadata_field};
-use crate::scan::{ArrowRecordBatchStream, FileScanTask, FileScanTaskStream};
-use crate::spec::{Datum, NameMapping, NestedField, PrimitiveType, Schema, Type};
-use crate::utils::available_parallelism;
-use crate::{Error, ErrorKind};
-
-/// Default gap between byte ranges below which they are coalesced into a
-/// single request. Matches object_store's `OBJECT_STORE_COALESCE_DEFAULT`.
-const DEFAULT_RANGE_COALESCE_BYTES: u64 = 1024 * 1024;
-
-/// Default maximum number of coalesced byte ranges fetched concurrently.
-/// Matches object_store's `OBJECT_STORE_COALESCE_PARALLEL`.
-const DEFAULT_RANGE_FETCH_CONCURRENCY: usize = 10;
-
-/// Default number of bytes to prefetch when parsing Parquet footer metadata.
-/// Matches DataFusion's default `ParquetOptions::metadata_size_hint`.
-const DEFAULT_METADATA_SIZE_HINT: usize = 512 * 1024;
-
-/// Options for tuning Parquet file I/O.
-#[derive(Clone, Copy, Debug, TypedBuilder)]
-#[builder(field_defaults(setter(prefix = "with_")))]
-pub(crate) struct ParquetReadOptions {
-    /// Number of bytes to prefetch for parsing the Parquet metadata.
-    ///
-    /// This hint can help reduce the number of fetch requests. For more details see the
-    /// [ParquetMetaDataReader documentation](https://docs.rs/parquet/latest/parquet/file/metadata/struct.ParquetMetaDataReader.html#method.with_prefetch_hint).
-    ///
-    /// Defaults to 512 KiB, matching DataFusion's default `ParquetOptions::metadata_size_hint`.
-    #[builder(default = Some(DEFAULT_METADATA_SIZE_HINT))]
-    pub(crate) metadata_size_hint: Option<usize>,
-    /// Gap threshold for merging nearby byte ranges into a single request.
-    /// Ranges with gaps smaller than this value will be coalesced.
-    ///
-    /// Defaults to 1 MiB, matching object_store's `OBJECT_STORE_COALESCE_DEFAULT`.
-    #[builder(default = DEFAULT_RANGE_COALESCE_BYTES)]
-    pub(crate) range_coalesce_bytes: u64,
-    /// Maximum number of merged byte ranges to fetch concurrently.
-    ///
-    /// Defaults to 10, matching object_store's `OBJECT_STORE_COALESCE_PARALLEL`.
-    #[builder(default = DEFAULT_RANGE_FETCH_CONCURRENCY)]
-    pub(crate) range_fetch_concurrency: usize,
-    /// Whether to preload the column index when reading Parquet metadata.
-    #[builder(default = true)]
-    pub(crate) preload_column_index: bool,
-    /// Whether to preload the offset index when reading Parquet metadata.
-    #[builder(default = true)]
-    pub(crate) preload_offset_index: bool,
-    /// Whether to preload the page index when reading Parquet metadata.
-    #[builder(default = false)]
-    pub(crate) preload_page_index: bool,
-}
-
-impl ParquetReadOptions {
-    pub(crate) fn metadata_size_hint(&self) -> Option<usize> {
-        self.metadata_size_hint
-    }
-
-    pub(crate) fn range_coalesce_bytes(&self) -> u64 {
-        self.range_coalesce_bytes
-    }
-
-    pub(crate) fn range_fetch_concurrency(&self) -> usize {
-        self.range_fetch_concurrency
-    }
-
-    pub(crate) fn preload_column_index(&self) -> bool {
-        self.preload_column_index
-    }
-
-    pub(crate) fn preload_offset_index(&self) -> bool {
-        self.preload_offset_index
-    }
-
-    pub(crate) fn preload_page_index(&self) -> bool {
-        self.preload_page_index
-    }
-}
-
-/// Builder to create ArrowReader
-pub struct ArrowReaderBuilder {
-    batch_size: Option<usize>,
-    file_io: FileIO,
-    concurrency_limit_data_files: usize,
-    row_group_filtering_enabled: bool,
-    row_selection_enabled: bool,
-    parquet_read_options: ParquetReadOptions,
-}
-
-impl ArrowReaderBuilder {
-    /// Create a new ArrowReaderBuilder
-    pub fn new(file_io: FileIO) -> Self {
-        let num_cpus = available_parallelism().get();
-
-        ArrowReaderBuilder {
-            batch_size: None,
-            file_io,
-            concurrency_limit_data_files: num_cpus,
-            row_group_filtering_enabled: true,
-            row_selection_enabled: false,
-            parquet_read_options: ParquetReadOptions::builder().build(),
-        }
-    }
-
-    /// Sets the max number of in flight data files that are being fetched
-    pub fn with_data_file_concurrency_limit(mut self, val: usize) -> Self {
-        self.concurrency_limit_data_files = val;
-        self
-    }
-
-    /// Sets the desired size of batches in the response
-    /// to something other than the default
-    pub fn with_batch_size(mut self, batch_size: usize) -> Self {
-        self.batch_size = Some(batch_size);
-        self
-    }
-
-    /// Determines whether to enable row group filtering.
-    pub fn with_row_group_filtering_enabled(mut self, row_group_filtering_enabled: bool) -> Self {
-        self.row_group_filtering_enabled = row_group_filtering_enabled;
-        self
-    }
-
-    /// Determines whether to enable row selection.
-    pub fn with_row_selection_enabled(mut self, row_selection_enabled: bool) -> Self {
-        self.row_selection_enabled = row_selection_enabled;
-        self
-    }
-
-    /// Provide a hint as to the number of bytes to prefetch for parsing the Parquet metadata
-    ///
-    /// This hint can help reduce the number of fetch requests. For more details see the
-    /// [ParquetMetaDataReader documentation](https://docs.rs/parquet/latest/parquet/file/metadata/struct.ParquetMetaDataReader.html#method.with_prefetch_hint).
-    pub fn with_metadata_size_hint(mut self, metadata_size_hint: usize) -> Self {
-        self.parquet_read_options.metadata_size_hint = Some(metadata_size_hint);
-        self
-    }
-
-    /// Sets the gap threshold for merging nearby byte ranges into a single request.
-    /// Ranges with gaps smaller than this value will be coalesced.
-    ///
-    /// Defaults to 1 MiB, matching object_store's OBJECT_STORE_COALESCE_DEFAULT.
-    pub fn with_range_coalesce_bytes(mut self, range_coalesce_bytes: u64) -> Self {
-        self.parquet_read_options.range_coalesce_bytes = range_coalesce_bytes;
-        self
-    }
-
-    /// Sets the maximum number of merged byte ranges to fetch concurrently.
-    ///
-    /// Defaults to 10, matching object_store's OBJECT_STORE_COALESCE_PARALLEL.
-    pub fn with_range_fetch_concurrency(mut self, range_fetch_concurrency: usize) -> Self {
-        self.parquet_read_options.range_fetch_concurrency = range_fetch_concurrency;
-        self
-    }
-
-    /// Build the ArrowReader.
-    pub fn build(self) -> ArrowReader {
-        ArrowReader {
-            batch_size: self.batch_size,
-            file_io: self.file_io.clone(),
-            delete_file_loader: CachingDeleteFileLoader::new(
-                self.file_io.clone(),
-                self.concurrency_limit_data_files,
-            ),
-            concurrency_limit_data_files: self.concurrency_limit_data_files,
-            row_group_filtering_enabled: self.row_group_filtering_enabled,
-            row_selection_enabled: self.row_selection_enabled,
-            parquet_read_options: self.parquet_read_options,
-        }
-    }
-}
-
-/// Reads data from Parquet files
-#[derive(Clone)]
-pub struct ArrowReader {
-    batch_size: Option<usize>,
-    file_io: FileIO,
-    delete_file_loader: CachingDeleteFileLoader,
-
-    /// the maximum number of data files that can be fetched at the same time
-    concurrency_limit_data_files: usize,
-
-    row_group_filtering_enabled: bool,
-    row_selection_enabled: bool,
-    parquet_read_options: ParquetReadOptions,
-}
-
-impl ArrowReader {
-    /// Take a stream of FileScanTasks and reads all the files.
-    /// Returns a stream of Arrow RecordBatches containing the data from the files
-    pub fn read(self, tasks: FileScanTaskStream) -> Result<ArrowRecordBatchStream> {
-        let file_io = self.file_io.clone();
-        let batch_size = self.batch_size;
-        let concurrency_limit_data_files = self.concurrency_limit_data_files;
-        let row_group_filtering_enabled = self.row_group_filtering_enabled;
-        let row_selection_enabled = self.row_selection_enabled;
-        let parquet_read_options = self.parquet_read_options;
-
-        // Fast-path for single concurrency to avoid overhead of try_flatten_unordered
-        let stream: ArrowRecordBatchStream = if concurrency_limit_data_files == 1 {
-            Box::pin(
-                tasks
-                    .and_then(move |task| {
-                        let file_io = file_io.clone();
-
-                        Self::process_file_scan_task(
-                            task,
-                            batch_size,
-                            file_io,
-                            self.delete_file_loader.clone(),
-                            row_group_filtering_enabled,
-                            row_selection_enabled,
-                            parquet_read_options,
-                        )
-                    })
-                    .map_err(|err| {
-                        Error::new(ErrorKind::Unexpected, "file scan task generate failed")
-                            .with_source(err)
-                    })
-                    .try_flatten(),
-            )
-        } else {
-            Box::pin(
-                tasks
-                    .map_ok(move |task| {
-                        let file_io = file_io.clone();
-
-                        Self::process_file_scan_task(
-                            task,
-                            batch_size,
-                            file_io,
-                            self.delete_file_loader.clone(),
-                            row_group_filtering_enabled,
-                            row_selection_enabled,
-                            parquet_read_options,
-                        )
-                    })
-                    .map_err(|err| {
-                        Error::new(ErrorKind::Unexpected, "file scan task generate failed")
-                            .with_source(err)
-                    })
-                    .try_buffer_unordered(concurrency_limit_data_files)
-                    .try_flatten_unordered(concurrency_limit_data_files),
-            )
-        };
-
-        Ok(stream)
-    }
-
-    async fn process_file_scan_task(
-        task: FileScanTask,
-        batch_size: Option<usize>,
-        file_io: FileIO,
-        delete_file_loader: CachingDeleteFileLoader,
-        row_group_filtering_enabled: bool,
-        row_selection_enabled: bool,
-        parquet_read_options: ParquetReadOptions,
-    ) -> Result<ArrowRecordBatchStream> {
-        let should_load_page_index =
-            (row_selection_enabled && task.predicate.is_some()) || !task.deletes.is_empty();
-        let mut parquet_read_options = parquet_read_options;
-        parquet_read_options.preload_page_index = should_load_page_index;
-
-        let delete_filter_rx =
-            delete_file_loader.load_deletes(&task.deletes, Arc::clone(&task.schema));
-
-        // Open the Parquet file once, loading its metadata
-        let (parquet_file_reader, arrow_metadata) = Self::open_parquet_file(
-            &task.data_file_path,
-            &file_io,
-            task.file_size_in_bytes,
-            parquet_read_options,
-        )
-        .await?;
-
-        // Check if Parquet file has embedded field IDs
-        // Corresponds to Java's ParquetSchemaUtil.hasIds()
-        // Reference: parquet/src/main/java/org/apache/iceberg/parquet/ParquetSchemaUtil.java:118
-        let missing_field_ids = arrow_metadata
-            .schema()
-            .fields()
-            .iter()
-            .next()
-            .is_some_and(|f| f.metadata().get(PARQUET_FIELD_ID_META_KEY).is_none());
-
-        // Three-branch schema resolution strategy matching Java's ReadConf constructor
-        //
-        // Per Iceberg spec Column Projection rules:
-        // "Columns in Iceberg data files are selected by field id. The table schema's column
-        // names and order may change after a data file is written, and projection must be done
-        // using field ids."
-        // https://iceberg.apache.org/spec/#column-projection
-        //
-        // When Parquet files lack field IDs (e.g., Hive/Spark migrations via add_files),
-        // we must assign field IDs BEFORE reading data to enable correct projection.
-        //
-        // Java's ReadConf determines field ID strategy:
-        // - Branch 1: hasIds(fileSchema) → trust embedded field IDs, use pruneColumns()
-        // - Branch 2: nameMapping present → applyNameMapping(), then pruneColumns()
-        // - Branch 3: fallback → addFallbackIds(), then pruneColumnsFallback()
-        let arrow_metadata = if missing_field_ids {
-            // Parquet file lacks field IDs - must assign them before reading
-            let arrow_schema = if let Some(name_mapping) = &task.name_mapping {
-                // Branch 2: Apply name mapping to assign correct Iceberg field IDs
-                // Per spec rule #2: "Use schema.name-mapping.default metadata to map field id
-                // to columns without field id"
-                // Corresponds to Java's ParquetSchemaUtil.applyNameMapping()
-                apply_name_mapping_to_arrow_schema(
-                    Arc::clone(arrow_metadata.schema()),
-                    name_mapping,
-                )?
-            } else {
-                // Branch 3: No name mapping - use position-based fallback IDs
-                // Corresponds to Java's ParquetSchemaUtil.addFallbackIds()
-                add_fallback_field_ids_to_arrow_schema(arrow_metadata.schema())
-            };
-
-            let options = ArrowReaderOptions::new().with_schema(arrow_schema);
-            ArrowReaderMetadata::try_new(Arc::clone(arrow_metadata.metadata()), options).map_err(
-                |e| {
-                    Error::new(
-                        ErrorKind::Unexpected,
-                        "Failed to create ArrowReaderMetadata with field ID schema",
-                    )
-                    .with_source(e)
-                },
-            )?
-        } else {
-            // Branch 1: File has embedded field IDs - trust them
-            arrow_metadata
-        };
-
-        // Build the stream reader, reusing the already-opened file reader
-        let mut record_batch_stream_builder =
-            ParquetRecordBatchStreamBuilder::new_with_metadata(parquet_file_reader, arrow_metadata);
-
-        // Filter out metadata fields for Parquet projection (they don't exist in files)
-        let project_field_ids_without_metadata: Vec<i32> = task
-            .project_field_ids
-            .iter()
-            .filter(|&&id| !is_metadata_field(id))
-            .copied()
-            .collect();
-
-        // Create projection mask based on field IDs
-        // - If file has embedded IDs: field-ID-based projection (missing_field_ids=false)
-        // - If name mapping applied: field-ID-based projection (missing_field_ids=true but IDs now match)
-        // - If fallback IDs: position-based projection (missing_field_ids=true)
-        let projection_mask = Self::get_arrow_projection_mask(
-            &project_field_ids_without_metadata,
-            &task.schema,
-            record_batch_stream_builder.parquet_schema(),
-            record_batch_stream_builder.schema(),
-            missing_field_ids, // Whether to use position-based (true) or field-ID-based (false) projection
-        )?;
-
-        record_batch_stream_builder =
-            record_batch_stream_builder.with_projection(projection_mask.clone());
-
-        // RecordBatchTransformer performs any transformations required on the RecordBatches
-        // that come back from the file, such as type promotion, default column insertion,
-        // column re-ordering, partition constants, and virtual field addition (like _file)
-        let mut record_batch_transformer_builder =
-            RecordBatchTransformerBuilder::new(task.schema_ref(), task.project_field_ids());
-
-        // Add the _file metadata column if it's in the projected fields
-        if task.project_field_ids().contains(&RESERVED_FIELD_ID_FILE) {
-            let file_datum = Datum::string(task.data_file_path.clone());
-            record_batch_transformer_builder =
-                record_batch_transformer_builder.with_constant(RESERVED_FIELD_ID_FILE, file_datum);
-        }
-
-        if let (Some(partition_spec), Some(partition_data)) =
-            (task.partition_spec.clone(), task.partition.clone())
-        {
-            record_batch_transformer_builder =
-                record_batch_transformer_builder.with_partition(partition_spec, partition_data)?;
-        }
-
-        let mut
record_batch_transformer = record_batch_transformer_builder.build(); - - if let Some(batch_size) = batch_size { - record_batch_stream_builder = record_batch_stream_builder.with_batch_size(batch_size); - } - - let delete_filter = delete_filter_rx.await.unwrap()?; - let delete_predicate = delete_filter.build_equality_delete_predicate(&task).await?; - - // In addition to the optional predicate supplied in the `FileScanTask`, - // we also have an optional predicate resulting from equality delete files. - // If both are present, we logical-AND them together to form a single filter - // predicate that we can pass to the `RecordBatchStreamBuilder`. - let final_predicate = match (&task.predicate, delete_predicate) { - (None, None) => None, - (Some(predicate), None) => Some(predicate.clone()), - (None, Some(ref predicate)) => Some(predicate.clone()), - (Some(filter_predicate), Some(delete_predicate)) => { - Some(filter_predicate.clone().and(delete_predicate)) - } - }; - - // There are three possible sources for potential lists of selected RowGroup indices, - // and two for `RowSelection`s. - // Selected RowGroup index lists can come from three sources: - // * When task.start and task.length specify a byte range (file splitting); - // * When there are equality delete files that are applicable; - // * When there is a scan predicate and row_group_filtering_enabled = true. - // `RowSelection`s can be created in either or both of the following cases: - // * When there are positional delete files that are applicable; - // * When there is a scan predicate and row_selection_enabled = true - // Note that row group filtering from predicates only happens when - // there is a scan predicate AND row_group_filtering_enabled = true, - // but we perform row selection filtering if there are applicable - // equality delete files OR (there is a scan predicate AND row_selection_enabled), - // since the only implemented method of applying positional deletes is - // by using a `RowSelection`. - let mut selected_row_group_indices = None; - let mut row_selection = None; - - // Filter row groups based on byte range from task.start and task.length. - // If both start and length are 0, read the entire file (backwards compatibility). 
-        if task.start != 0 || task.length != 0 {
-            let byte_range_filtered_row_groups = Self::filter_row_groups_by_byte_range(
-                record_batch_stream_builder.metadata(),
-                task.start,
-                task.length,
-            )?;
-            selected_row_group_indices = Some(byte_range_filtered_row_groups);
-        }
-
-        if let Some(predicate) = final_predicate {
-            let (iceberg_field_ids, field_id_map) = Self::build_field_id_set_and_map(
-                record_batch_stream_builder.parquet_schema(),
-                &predicate,
-            )?;
-
-            let row_filter = Self::get_row_filter(
-                &predicate,
-                record_batch_stream_builder.parquet_schema(),
-                &iceberg_field_ids,
-                &field_id_map,
-            )?;
-            record_batch_stream_builder = record_batch_stream_builder.with_row_filter(row_filter);
-
-            if row_group_filtering_enabled {
-                let predicate_filtered_row_groups = Self::get_selected_row_group_indices(
-                    &predicate,
-                    record_batch_stream_builder.metadata(),
-                    &field_id_map,
-                    &task.schema,
-                )?;
-
-                // Merge predicate-based filtering with byte range filtering (if present)
-                // by taking the intersection of both filters
-                selected_row_group_indices = match selected_row_group_indices {
-                    Some(byte_range_filtered) => {
-                        // Keep only row groups that are in both filters
-                        let intersection: Vec<usize> = byte_range_filtered
-                            .into_iter()
-                            .filter(|idx| predicate_filtered_row_groups.contains(idx))
-                            .collect();
-                        Some(intersection)
-                    }
-                    None => Some(predicate_filtered_row_groups),
-                };
-            }
-
-            if row_selection_enabled {
-                row_selection = Some(Self::get_row_selection_for_filter_predicate(
-                    &predicate,
-                    record_batch_stream_builder.metadata(),
-                    &selected_row_group_indices,
-                    &field_id_map,
-                    &task.schema,
-                )?);
-            }
-        }
-
-        let positional_delete_indexes = delete_filter.get_delete_vector(&task);
-
-        if let Some(positional_delete_indexes) = positional_delete_indexes {
-            let delete_row_selection = {
-                let positional_delete_indexes = positional_delete_indexes.lock().unwrap();
-
-                Self::build_deletes_row_selection(
-                    record_batch_stream_builder.metadata().row_groups(),
-                    &selected_row_group_indices,
-                    &positional_delete_indexes,
-                )
-            }?;
-
-            // merge the row selection from the delete files with the row selection
-            // from the filter predicate, if there is one from the filter predicate
-            row_selection = match row_selection {
-                None => Some(delete_row_selection),
-                Some(filter_row_selection) => {
-                    Some(filter_row_selection.intersection(&delete_row_selection))
-                }
-            };
-        }
-
-        if let Some(row_selection) = row_selection {
-            record_batch_stream_builder =
-                record_batch_stream_builder.with_row_selection(row_selection);
-        }
-
-        if let Some(selected_row_group_indices) = selected_row_group_indices {
-            record_batch_stream_builder =
-                record_batch_stream_builder.with_row_groups(selected_row_group_indices);
-        }
-
-        // Build the batch stream and send all the RecordBatches that it generates
-        // to the requester.
-        let record_batch_stream =
-            record_batch_stream_builder
-                .build()?
-                .map(move |batch| match batch {
-                    Ok(batch) => {
-                        // Process the record batch (type promotion, column reordering, virtual fields, etc.)
-                        record_batch_transformer.process_record_batch(batch)
-                    }
-                    Err(err) => Err(err.into()),
-                });
-
-        Ok(Box::pin(record_batch_stream) as ArrowRecordBatchStream)
-    }
-
-    /// Opens a Parquet file and loads its metadata, returning both the reader and metadata.
-    /// The reader can be reused to build a `ParquetRecordBatchStreamBuilder` without
-    /// reopening the file.
-    pub(crate) async fn open_parquet_file(
-        data_file_path: &str,
-        file_io: &FileIO,
-        file_size_in_bytes: u64,
-        parquet_read_options: ParquetReadOptions,
-    ) -> Result<(ArrowFileReader, ArrowReaderMetadata)> {
-        let parquet_file = file_io.new_input(data_file_path)?;
-        let parquet_reader = parquet_file.reader().await?;
-        let mut reader = ArrowFileReader::new(
-            FileMetadata {
-                size: file_size_in_bytes,
-            },
-            parquet_reader,
-        )
-        .with_parquet_read_options(parquet_read_options);
-
-        let arrow_metadata = ArrowReaderMetadata::load_async(&mut reader, Default::default())
-            .await
-            .map_err(|e| {
-                Error::new(ErrorKind::Unexpected, "Failed to load Parquet metadata").with_source(e)
-            })?;
-
-        Ok((reader, arrow_metadata))
-    }
-
-    /// computes a `RowSelection` from positional delete indices.
-    ///
-    /// Using the Parquet page index, we build a `RowSelection` that rejects rows that are indicated
-    /// as having been deleted by a positional delete, taking into account any row groups that have
-    /// been skipped entirely by the filter predicate
-    fn build_deletes_row_selection(
-        row_group_metadata_list: &[RowGroupMetaData],
-        selected_row_groups: &Option<Vec<usize>>,
-        positional_deletes: &DeleteVector,
-    ) -> Result<RowSelection> {
-        let mut results: Vec<RowSelector> = Vec::new();
-        let mut selected_row_groups_idx = 0;
-        let mut current_row_group_base_idx: u64 = 0;
-        let mut delete_vector_iter = positional_deletes.iter();
-        let mut next_deleted_row_idx_opt = delete_vector_iter.next();
-
-        for (idx, row_group_metadata) in row_group_metadata_list.iter().enumerate() {
-            let row_group_num_rows = row_group_metadata.num_rows() as u64;
-            let next_row_group_base_idx = current_row_group_base_idx + row_group_num_rows;
-
-            // if row group selection is enabled,
-            if let Some(selected_row_groups) = selected_row_groups {
-                // if we've consumed all the selected row groups, we're done
-                if selected_row_groups_idx == selected_row_groups.len() {
-                    break;
-                }
-
-                if idx == selected_row_groups[selected_row_groups_idx] {
-                    // we're in a selected row group. Increment selected_row_groups_idx
-                    // so that next time around the for loop we're looking for the next
-                    // selected row group
-                    selected_row_groups_idx += 1;
-                } else {
-                    // Advance iterator past all deletes in the skipped row group.
-                    // advance_to() positions the iterator to the first delete >= next_row_group_base_idx.
-                    // However, if our cached next_deleted_row_idx_opt is in the skipped range,
-                    // we need to call next() to update the cache with the newly positioned value.
-                    delete_vector_iter.advance_to(next_row_group_base_idx);
-                    // Only update the cache if the cached value is stale (in the skipped range)
-                    if let Some(cached_idx) = next_deleted_row_idx_opt
-                        && cached_idx < next_row_group_base_idx
-                    {
-                        next_deleted_row_idx_opt = delete_vector_iter.next();
-                    }
-
-                    // still increment the current page base index but then skip to the next row group
-                    // in the file
-                    current_row_group_base_idx += row_group_num_rows;
-                    continue;
-                }
-            }
-
-            let mut next_deleted_row_idx = match next_deleted_row_idx_opt {
-                Some(next_deleted_row_idx) => {
-                    // if the index of the next deleted row is beyond this row group, add a selection for
-                    // the remainder of this row group and skip to the next row group
-                    if next_deleted_row_idx >= next_row_group_base_idx {
-                        results.push(RowSelector::select(row_group_num_rows as usize));
-                        current_row_group_base_idx += row_group_num_rows;
-                        continue;
-                    }
-
-                    next_deleted_row_idx
-                }
-
-                // If there are no more pos deletes, add a selector for the entirety of this row group.
-                _ => {
-                    results.push(RowSelector::select(row_group_num_rows as usize));
-                    current_row_group_base_idx += row_group_num_rows;
-                    continue;
-                }
-            };
-
-            let mut current_idx = current_row_group_base_idx;
-            'chunks: while next_deleted_row_idx < next_row_group_base_idx {
-                // `select` all rows that precede the next delete index
-                if current_idx < next_deleted_row_idx {
-                    let run_length = next_deleted_row_idx - current_idx;
-                    results.push(RowSelector::select(run_length as usize));
-                    current_idx += run_length;
-                }
-
-                // `skip` all consecutive deleted rows in the current row group
-                let mut run_length = 0;
-                while next_deleted_row_idx == current_idx
-                    && next_deleted_row_idx < next_row_group_base_idx
-                {
-                    run_length += 1;
-                    current_idx += 1;
-
-                    next_deleted_row_idx_opt = delete_vector_iter.next();
-                    next_deleted_row_idx = match next_deleted_row_idx_opt {
-                        Some(next_deleted_row_idx) => next_deleted_row_idx,
-                        _ => {
-                            // We've processed the final positional delete.
-                            // Conclude the skip and then break so that we select the remaining
-                            // rows in the row group and move on to the next row group
-                            results.push(RowSelector::skip(run_length));
-                            break 'chunks;
-                        }
-                    };
-                }
-                if run_length > 0 {
-                    results.push(RowSelector::skip(run_length));
-                }
-            }
-
-            if current_idx < next_row_group_base_idx {
-                results.push(RowSelector::select(
-                    (next_row_group_base_idx - current_idx) as usize,
-                ));
-            }
-
-            current_row_group_base_idx += row_group_num_rows;
-        }
-
-        Ok(results.into())
-    }
-
-    fn build_field_id_set_and_map(
-        parquet_schema: &SchemaDescriptor,
-        predicate: &BoundPredicate,
-    ) -> Result<(HashSet<i32>, HashMap<i32, usize>)> {
-        // Collects all Iceberg field IDs referenced in the filter predicate
-        let mut collector = CollectFieldIdVisitor {
-            field_ids: HashSet::default(),
-        };
-        visit(&mut collector, predicate)?;
-
-        let iceberg_field_ids = collector.field_ids();
-
-        // Without embedded field IDs, we fall back to position-based mapping for compatibility
-        let field_id_map = match build_field_id_map(parquet_schema)? {
-            Some(map) => map,
-            None => build_fallback_field_id_map(parquet_schema),
-        };
-
-        Ok((iceberg_field_ids, field_id_map))
-    }
-
-    /// Recursively extract leaf field IDs because Parquet projection works at the leaf column level.
-    /// Nested types (struct/list/map) are flattened in Parquet's columnar format.
-    fn include_leaf_field_id(field: &NestedField, field_ids: &mut Vec<i32>) {
-        match field.field_type.as_ref() {
-            Type::Primitive(_) => {
-                field_ids.push(field.id);
-            }
-            Type::Struct(struct_type) => {
-                for nested_field in struct_type.fields() {
-                    Self::include_leaf_field_id(nested_field, field_ids);
-                }
-            }
-            Type::List(list_type) => {
-                Self::include_leaf_field_id(&list_type.element_field, field_ids);
-            }
-            Type::Map(map_type) => {
-                Self::include_leaf_field_id(&map_type.key_field, field_ids);
-                Self::include_leaf_field_id(&map_type.value_field, field_ids);
-            }
-        }
-    }
-
-    fn get_arrow_projection_mask(
-        field_ids: &[i32],
-        iceberg_schema_of_task: &Schema,
-        parquet_schema: &SchemaDescriptor,
-        arrow_schema: &ArrowSchemaRef,
-        use_fallback: bool, // Whether file lacks embedded field IDs (e.g., migrated from Hive/Spark)
-    ) -> Result<ProjectionMask> {
-        fn type_promotion_is_valid(
-            file_type: Option<&PrimitiveType>,
-            projected_type: Option<&PrimitiveType>,
-        ) -> bool {
-            match (file_type, projected_type) {
-                (Some(lhs), Some(rhs)) if lhs == rhs => true,
-                (Some(PrimitiveType::Int), Some(PrimitiveType::Long)) => true,
-                (Some(PrimitiveType::Float), Some(PrimitiveType::Double)) => true,
-                (
-                    Some(PrimitiveType::Decimal {
-                        precision: file_precision,
-                        scale: file_scale,
-                    }),
-                    Some(PrimitiveType::Decimal {
-                        precision: requested_precision,
-                        scale: requested_scale,
-                    }),
-                ) if requested_precision >= file_precision && file_scale == requested_scale => true,
-                // Uuid will be store as Fixed(16) in parquet file, so the read back type will be Fixed(16).
-                (Some(PrimitiveType::Fixed(16)), Some(PrimitiveType::Uuid)) => true,
-                _ => false,
-            }
-        }
-
-        if field_ids.is_empty() {
-            return Ok(ProjectionMask::all());
-        }
-
-        if use_fallback {
-            // Position-based projection necessary because file lacks embedded field IDs
-            Self::get_arrow_projection_mask_fallback(field_ids, parquet_schema)
-        } else {
-            // Field-ID-based projection using embedded field IDs from Parquet metadata
-
-            // Parquet's columnar format requires leaf-level (not top-level struct/list/map) projection
-            let mut leaf_field_ids = vec![];
-            for field_id in field_ids {
-                let field = iceberg_schema_of_task.field_by_id(*field_id);
-                if let Some(field) = field {
-                    Self::include_leaf_field_id(field, &mut leaf_field_ids);
-                }
-            }
-
-            Self::get_arrow_projection_mask_with_field_ids(
-                &leaf_field_ids,
-                iceberg_schema_of_task,
-                parquet_schema,
-                arrow_schema,
-                type_promotion_is_valid,
-            )
-        }
-    }
-
-    /// Standard projection using embedded field IDs from Parquet metadata.
-    /// For iceberg-java compatibility with ParquetSchemaUtil.pruneColumns().
-    fn get_arrow_projection_mask_with_field_ids(
-        leaf_field_ids: &[i32],
-        iceberg_schema_of_task: &Schema,
-        parquet_schema: &SchemaDescriptor,
-        arrow_schema: &ArrowSchemaRef,
-        type_promotion_is_valid: fn(Option<&PrimitiveType>, Option<&PrimitiveType>) -> bool,
-    ) -> Result<ProjectionMask> {
-        let mut column_map = HashMap::new();
-        let fields = arrow_schema.fields();
-
-        // Pre-project only the fields that have been selected, possibly avoiding converting
-        // some Arrow types that are not yet supported.
-        let mut projected_fields: HashMap<FieldRef, i32> = HashMap::new();
-        let projected_arrow_schema = ArrowSchema::new_with_metadata(
-            fields.filter_leaves(|_, f| {
-                f.metadata()
-                    .get(PARQUET_FIELD_ID_META_KEY)
-                    .and_then(|field_id| i32::from_str(field_id).ok())
-                    .is_some_and(|field_id| {
-                        projected_fields.insert((*f).clone(), field_id);
-                        leaf_field_ids.contains(&field_id)
-                    })
-            }),
-            arrow_schema.metadata().clone(),
-        );
-        let iceberg_schema = arrow_schema_to_schema(&projected_arrow_schema)?;
-
-        fields.filter_leaves(|idx, field| {
-            let Some(field_id) = projected_fields.get(field).cloned() else {
-                return false;
-            };
-
-            let iceberg_field = iceberg_schema_of_task.field_by_id(field_id);
-            let parquet_iceberg_field = iceberg_schema.field_by_id(field_id);
-
-            if iceberg_field.is_none() || parquet_iceberg_field.is_none() {
-                return false;
-            }
-
-            if !type_promotion_is_valid(
-                parquet_iceberg_field
-                    .unwrap()
-                    .field_type
-                    .as_primitive_type(),
-                iceberg_field.unwrap().field_type.as_primitive_type(),
-            ) {
-                return false;
-            }
-
-            column_map.insert(field_id, idx);
-            true
-        });
-
-        // Schema evolution: New columns may not exist in old Parquet files.
-        // We only project existing columns; RecordBatchTransformer adds default/NULL values.
-        let mut indices = vec![];
-        for field_id in leaf_field_ids {
-            if let Some(col_idx) = column_map.get(field_id) {
-                indices.push(*col_idx);
-            }
-        }
-
-        if indices.is_empty() {
-            // Edge case: All requested columns are new (don't exist in file).
-            // Project all columns so RecordBatchTransformer has a batch to transform.
-            Ok(ProjectionMask::all())
-        } else {
-            Ok(ProjectionMask::leaves(parquet_schema, indices))
-        }
-    }
-
-    /// Fallback projection for Parquet files without field IDs.
-    /// Uses position-based matching: field ID N → column position N-1.
-    /// Projects entire top-level columns (including nested content) for iceberg-java compatibility.
-    fn get_arrow_projection_mask_fallback(
-        field_ids: &[i32],
-        parquet_schema: &SchemaDescriptor,
-    ) -> Result<ProjectionMask> {
-        // Position-based: field_id N → column N-1 (field IDs are 1-indexed)
-        let parquet_root_fields = parquet_schema.root_schema().get_fields();
-        let mut root_indices = vec![];
-
-        for field_id in field_ids.iter() {
-            let parquet_pos = (*field_id - 1) as usize;
-
-            if parquet_pos < parquet_root_fields.len() {
-                root_indices.push(parquet_pos);
-            }
-            // RecordBatchTransformer adds missing columns with NULL values
-        }
-
-        if root_indices.is_empty() {
-            Ok(ProjectionMask::all())
-        } else {
-            Ok(ProjectionMask::roots(parquet_schema, root_indices))
-        }
-    }
-
-    fn get_row_filter(
-        predicates: &BoundPredicate,
-        parquet_schema: &SchemaDescriptor,
-        iceberg_field_ids: &HashSet<i32>,
-        field_id_map: &HashMap<i32, usize>,
-    ) -> Result<RowFilter> {
-        // Collect Parquet column indices from field ids.
-        // If the field id is not found in Parquet schema, it will be ignored due to schema evolution.
-        let mut column_indices = iceberg_field_ids
-            .iter()
-            .filter_map(|field_id| field_id_map.get(field_id).cloned())
-            .collect::<Vec<_>>();
-        column_indices.sort();
-
-        // The converter that converts `BoundPredicates` to `ArrowPredicates`
-        let mut converter = PredicateConverter {
-            parquet_schema,
-            column_map: field_id_map,
-            column_indices: &column_indices,
-        };
-
-        // After collecting required leaf column indices used in the predicate,
-        // creates the projection mask for the Arrow predicates.
- let projection_mask = ProjectionMask::leaves(parquet_schema, column_indices.clone()); - let predicate_func = visit(&mut converter, predicates)?; - let arrow_predicate = ArrowPredicateFn::new(projection_mask, predicate_func); - Ok(RowFilter::new(vec![Box::new(arrow_predicate)])) - } - - fn get_selected_row_group_indices( - predicate: &BoundPredicate, - parquet_metadata: &Arc<ParquetMetaData>, - field_id_map: &HashMap<i32, usize>, - snapshot_schema: &Schema, - ) -> Result<Vec<usize>> { - let row_groups_metadata = parquet_metadata.row_groups(); - let mut results = Vec::with_capacity(row_groups_metadata.len()); - - for (idx, row_group_metadata) in row_groups_metadata.iter().enumerate() { - if RowGroupMetricsEvaluator::eval( - predicate, - row_group_metadata, - field_id_map, - snapshot_schema, - )? { - results.push(idx); - } - } - - Ok(results) - } - - fn get_row_selection_for_filter_predicate( - predicate: &BoundPredicate, - parquet_metadata: &Arc<ParquetMetaData>, - selected_row_groups: &Option<Vec<usize>>, - field_id_map: &HashMap<i32, usize>, - snapshot_schema: &Schema, - ) -> Result<RowSelection> { - let Some(column_index) = parquet_metadata.column_index() else { - return Err(Error::new( - ErrorKind::Unexpected, - "Parquet file metadata does not contain a column index", - )); - }; - - let Some(offset_index) = parquet_metadata.offset_index() else { - return Err(Error::new( - ErrorKind::Unexpected, - "Parquet file metadata does not contain an offset index", - )); - }; - - // If all row groups were filtered out, return an empty RowSelection (select no rows) - if let Some(selected_row_groups) = selected_row_groups - && selected_row_groups.is_empty() - { - return Ok(RowSelection::from(Vec::new())); - } - - let mut selected_row_groups_idx = 0; - - let page_index = column_index - .iter() - .enumerate() - .zip(offset_index) - .zip(parquet_metadata.row_groups()); - - let mut results = Vec::new(); - for (((idx, column_index), offset_index), row_group_metadata) in page_index { - if let Some(selected_row_groups) = selected_row_groups { - // skip row groups that aren't present in selected_row_groups - if idx == selected_row_groups[selected_row_groups_idx] { - selected_row_groups_idx += 1; - } else { - continue; - } - } - - let selections_for_page = PageIndexEvaluator::eval( - predicate, - column_index, - offset_index, - row_group_metadata, - field_id_map, - snapshot_schema, - )?; - - results.push(selections_for_page); - - if let Some(selected_row_groups) = selected_row_groups - && selected_row_groups_idx == selected_row_groups.len() - { - break; - } - } - - Ok(results.into_iter().flatten().collect::<Vec<_>>().into()) - } - - /// Filters row groups by byte range to support Iceberg's file splitting. - /// - /// Iceberg splits large files at row group boundaries, so we only read row groups - /// whose byte ranges overlap with [start, start+length). - fn filter_row_groups_by_byte_range( - parquet_metadata: &Arc<ParquetMetaData>, - start: u64, - length: u64, - ) -> Result<Vec<usize>> { - let row_groups = parquet_metadata.row_groups(); - let mut selected = Vec::new(); - let end = start + length; - - // Row groups are stored sequentially after the 4-byte magic header. - let mut current_byte_offset = 4u64; - - for (idx, row_group) in row_groups.iter().enumerate() { - let row_group_size = row_group.compressed_size() as u64; - let row_group_end = current_byte_offset + row_group_size; - - if current_byte_offset < end && start < row_group_end { - selected.push(idx); - } - - current_byte_offset = row_group_end; - } - - Ok(selected) - } -} - -/// Build the map of parquet field id to Parquet column index in the schema.
-/// Returns None if the Parquet file doesn't have field IDs embedded (e.g., migrated tables). -fn build_field_id_map(parquet_schema: &SchemaDescriptor) -> Result<Option<HashMap<i32, usize>>> { - let mut column_map = HashMap::new(); - - for (idx, field) in parquet_schema.columns().iter().enumerate() { - let field_type = field.self_type(); - match field_type { - ParquetType::PrimitiveType { basic_info, .. } => { - if !basic_info.has_id() { - return Ok(None); - } - column_map.insert(basic_info.id(), idx); - } - ParquetType::GroupType { .. } => { - return Err(Error::new( - ErrorKind::DataInvalid, - format!( - "Leaf column in schema should be primitive type but got {field_type:?}" - ), - )); - } - }; - } - - Ok(Some(column_map)) -} - -/// Build a fallback field ID map for Parquet files without embedded field IDs. -/// Position-based (1, 2, 3, ...) for compatibility with iceberg-java migrations. -fn build_fallback_field_id_map(parquet_schema: &SchemaDescriptor) -> HashMap<i32, usize> { - let mut column_map = HashMap::new(); - - // 1-indexed to match iceberg-java's convention - for (idx, _field) in parquet_schema.columns().iter().enumerate() { - let field_id = (idx + 1) as i32; - column_map.insert(field_id, idx); - } - - column_map -} - -/// Apply name mapping to Arrow schema for Parquet files lacking field IDs. -/// -/// Assigns Iceberg field IDs based on column names using the name mapping, -/// enabling correct projection on migrated files (e.g., from Hive/Spark via add_files). -/// -/// Per Iceberg spec Column Projection rule #2: -/// "Use schema.name-mapping.default metadata to map field id to columns without field id" -/// https://iceberg.apache.org/spec/#column-projection -/// -/// Corresponds to Java's ParquetSchemaUtil.applyNameMapping() and ApplyNameMapping visitor. -/// The key difference is Java operates on Parquet MessageType, while we operate on Arrow Schema. -/// -/// # Arguments -/// * `arrow_schema` - Arrow schema from Parquet file (without field IDs) -/// * `name_mapping` - Name mapping from table metadata (TableProperties.DEFAULT_NAME_MAPPING) -/// -/// # Returns -/// Arrow schema with field IDs assigned based on name mapping -fn apply_name_mapping_to_arrow_schema( - arrow_schema: ArrowSchemaRef, - name_mapping: &NameMapping, -) -> Result<Arc<ArrowSchema>> { - debug_assert!( - arrow_schema - .fields() - .iter() - .next() - .is_none_or(|f| f.metadata().get(PARQUET_FIELD_ID_META_KEY).is_none()), - "Schema already has field IDs - name mapping should not be applied" - ); - - use arrow_schema::Field; - - let fields_with_mapped_ids: Vec<_> = arrow_schema - .fields() - .iter() - .map(|field| { - // Look up this column name in name mapping to get the Iceberg field ID. - // Corresponds to Java's ApplyNameMapping visitor which calls - // nameMapping.find(currentPath()) and returns field.withId() if found. - // - // If the field isn't in the mapping, leave it WITHOUT assigning an ID - // (matching Java's behavior of returning the field unchanged). - // Later, during projection, fields without IDs are filtered out.
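- // (Hypothetical example of a `schema.name-mapping.default` entry: - // `[{"field-id": 1, "names": ["id", "record_id"]}]` assigns field ID 1 to a Parquet - // column named either "id" or "record_id".)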
- let mapped_field_opt = name_mapping - .fields() - .iter() - .find(|f| f.names().contains(&field.name().to_string())); - - let mut metadata = field.metadata().clone(); - - if let Some(mapped_field) = mapped_field_opt - && let Some(field_id) = mapped_field.field_id() - { - // Field found in mapping with a field_id → assign it - metadata.insert(PARQUET_FIELD_ID_META_KEY.to_string(), field_id.to_string()); - } - // If field_id is None, leave the field without an ID (will be filtered by projection) - - Field::new(field.name(), field.data_type().clone(), field.is_nullable()) - .with_metadata(metadata) - }) - .collect(); - - Ok(Arc::new(ArrowSchema::new_with_metadata( - fields_with_mapped_ids, - arrow_schema.metadata().clone(), - ))) -} - -/// Add position-based fallback field IDs to Arrow schema for Parquet files lacking them. -/// Enables projection on migrated files (e.g., from Hive/Spark). -/// -/// Why at schema level (not per-batch): Efficiency - avoids repeated schema modification. -/// Why only top-level: Nested projection uses leaf column indices, not parent struct IDs. -/// Why 1-indexed: Compatibility with iceberg-java's ParquetSchemaUtil.addFallbackIds(). -fn add_fallback_field_ids_to_arrow_schema(arrow_schema: &ArrowSchemaRef) -> Arc<ArrowSchema> { - debug_assert!( - arrow_schema - .fields() - .iter() - .next() - .is_none_or(|f| f.metadata().get(PARQUET_FIELD_ID_META_KEY).is_none()), - "Schema already has field IDs" - ); - - use arrow_schema::Field; - - let fields_with_fallback_ids: Vec<_> = arrow_schema - .fields() - .iter() - .enumerate() - .map(|(pos, field)| { - let mut metadata = field.metadata().clone(); - let field_id = (pos + 1) as i32; // 1-indexed for Java compatibility - metadata.insert(PARQUET_FIELD_ID_META_KEY.to_string(), field_id.to_string()); - - Field::new(field.name(), field.data_type().clone(), field.is_nullable()) - .with_metadata(metadata) - }) - .collect(); - - Arc::new(ArrowSchema::new_with_metadata( - fields_with_fallback_ids, - arrow_schema.metadata().clone(), - )) -} - -/// A visitor to collect field ids from bound predicates.
-struct CollectFieldIdVisitor { - field_ids: HashSet, -} - -impl CollectFieldIdVisitor { - fn field_ids(self) -> HashSet { - self.field_ids - } -} - -impl BoundPredicateVisitor for CollectFieldIdVisitor { - type T = (); - - fn always_true(&mut self) -> Result<()> { - Ok(()) - } - - fn always_false(&mut self) -> Result<()> { - Ok(()) - } - - fn and(&mut self, _lhs: (), _rhs: ()) -> Result<()> { - Ok(()) - } - - fn or(&mut self, _lhs: (), _rhs: ()) -> Result<()> { - Ok(()) - } - - fn not(&mut self, _inner: ()) -> Result<()> { - Ok(()) - } - - fn is_null(&mut self, reference: &BoundReference, _predicate: &BoundPredicate) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn not_null(&mut self, reference: &BoundReference, _predicate: &BoundPredicate) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn is_nan(&mut self, reference: &BoundReference, _predicate: &BoundPredicate) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn not_nan(&mut self, reference: &BoundReference, _predicate: &BoundPredicate) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn less_than( - &mut self, - reference: &BoundReference, - _literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn less_than_or_eq( - &mut self, - reference: &BoundReference, - _literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn greater_than( - &mut self, - reference: &BoundReference, - _literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn greater_than_or_eq( - &mut self, - reference: &BoundReference, - _literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn eq( - &mut self, - reference: &BoundReference, - _literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn not_eq( - &mut self, - reference: &BoundReference, - _literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn starts_with( - &mut self, - reference: &BoundReference, - _literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn not_starts_with( - &mut self, - reference: &BoundReference, - _literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn r#in( - &mut self, - reference: &BoundReference, - _literals: &FnvHashSet, - _predicate: &BoundPredicate, - ) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn not_in( - &mut self, - reference: &BoundReference, - _literals: &FnvHashSet, - _predicate: &BoundPredicate, - ) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } -} - -/// A visitor to convert Iceberg bound predicates to Arrow predicates. -struct PredicateConverter<'a> { - /// The Parquet schema descriptor. - pub parquet_schema: &'a SchemaDescriptor, - /// The map between field id and leaf column index in Parquet schema. - pub column_map: &'a HashMap, - /// The required column indices in Parquet schema for the predicates. 
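- /// Sorted in ascending order; `bound_reference` maps a leaf column to its position in this list.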
- pub column_indices: &'a Vec<usize>, -} - -impl PredicateConverter<'_> { - /// When visiting a bound reference, we return index of the leaf column in the - /// required column indices which is used to project the column in the record batch. - /// Return None if the field id is not found in the column map, which is possible - /// due to schema evolution. - fn bound_reference(&mut self, reference: &BoundReference) -> Result<Option<usize>> { - // The leaf column's index in Parquet schema. - if let Some(column_idx) = self.column_map.get(&reference.field().id) { - if self.parquet_schema.get_column_root(*column_idx).is_group() { - return Err(Error::new( - ErrorKind::DataInvalid, - format!( - "Leaf column `{}` in predicates isn't a root column in Parquet schema.", - reference.field().name - ), - )); - } - - // The leaf column's index in the required column indices. - let index = self - .column_indices - .iter() - .position(|&idx| idx == *column_idx) - .ok_or(Error::new( - ErrorKind::DataInvalid, - format!( - "Leaf column `{}` in predicates cannot be found in the required column indices.", - reference.field().name - ), - ))?; - - Ok(Some(index)) - } else { - Ok(None) - } - } - - /// Build an Arrow predicate that always returns true. - fn build_always_true(&self) -> Result<Box<PredicateResult>> { - Ok(Box::new(|batch| { - Ok(BooleanArray::from(vec![true; batch.num_rows()])) - })) - } - - /// Build an Arrow predicate that always returns false. - fn build_always_false(&self) -> Result<Box<PredicateResult>> { - Ok(Box::new(|batch| { - Ok(BooleanArray::from(vec![false; batch.num_rows()])) - })) - } -} - -/// Gets the leaf column from the record batch for the required column index. Only -/// supports top-level columns for now. -fn project_column( - batch: &RecordBatch, - column_idx: usize, -) -> std::result::Result<ArrayRef, ArrowError> { - let column = batch.column(column_idx); - - match column.data_type() { - DataType::Struct(_) => Err(ArrowError::SchemaError( - "Does not support struct column yet.".to_string(), - )), - _ => Ok(column.clone()), - } -} - -type PredicateResult = - dyn FnMut(RecordBatch) -> std::result::Result<BooleanArray, ArrowError> + Send + 'static; - -impl BoundPredicateVisitor for PredicateConverter<'_> { - type T = Box<PredicateResult>; - - fn always_true(&mut self) -> Result<Box<PredicateResult>> { - self.build_always_true() - } - - fn always_false(&mut self) -> Result<Box<PredicateResult>> { - self.build_always_false() - } - - fn and( - &mut self, - mut lhs: Box<PredicateResult>, - mut rhs: Box<PredicateResult>, - ) -> Result<Box<PredicateResult>> { - Ok(Box::new(move |batch| { - let left = lhs(batch.clone())?; - let right = rhs(batch)?; - and_kleene(&left, &right) - })) - } - - fn or( - &mut self, - mut lhs: Box<PredicateResult>, - mut rhs: Box<PredicateResult>, - ) -> Result<Box<PredicateResult>> { - Ok(Box::new(move |batch| { - let left = lhs(batch.clone())?; - let right = rhs(batch)?; - or_kleene(&left, &right) - })) - } - - fn not(&mut self, mut inner: Box<PredicateResult>) -> Result<Box<PredicateResult>> { - Ok(Box::new(move |batch| { - let pred_ret = inner(batch)?; - not(&pred_ret) - })) - } - - fn is_null( - &mut self, - reference: &BoundReference, - _predicate: &BoundPredicate, - ) -> Result<Box<PredicateResult>> { - if let Some(idx) = self.bound_reference(reference)? { - Ok(Box::new(move |batch| { - let column = project_column(&batch, idx)?; - is_null(&column) - })) - } else { - // A missing column, treating it as null. - self.build_always_true() - } - } - - fn not_null( - &mut self, - reference: &BoundReference, - _predicate: &BoundPredicate, - ) -> Result<Box<PredicateResult>> { - if let Some(idx) = self.bound_reference(reference)? { - Ok(Box::new(move |batch| { - let column = project_column(&batch, idx)?; - is_not_null(&column) - })) - } else { - // A missing column, treating it as null.
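- // (Kleene semantics: `NULL IS NOT NULL` is false, so a column that is missing from - // this file can never satisfy NOT NULL.)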
- self.build_always_false() - } - } - - fn is_nan( - &mut self, - reference: &BoundReference, - _predicate: &BoundPredicate, - ) -> Result> { - if self.bound_reference(reference)?.is_some() { - self.build_always_true() - } else { - // A missing column, treating it as null. - self.build_always_false() - } - } - - fn not_nan( - &mut self, - reference: &BoundReference, - _predicate: &BoundPredicate, - ) -> Result> { - if self.bound_reference(reference)?.is_some() { - self.build_always_false() - } else { - // A missing column, treating it as null. - self.build_always_true() - } - } - - fn less_than( - &mut self, - reference: &BoundReference, - literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result> { - if let Some(idx) = self.bound_reference(reference)? { - let literal = get_arrow_datum(literal)?; - - Ok(Box::new(move |batch| { - let left = project_column(&batch, idx)?; - let literal = try_cast_literal(&literal, left.data_type())?; - lt(&left, literal.as_ref()) - })) - } else { - // A missing column, treating it as null. - self.build_always_true() - } - } - - fn less_than_or_eq( - &mut self, - reference: &BoundReference, - literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result> { - if let Some(idx) = self.bound_reference(reference)? { - let literal = get_arrow_datum(literal)?; - - Ok(Box::new(move |batch| { - let left = project_column(&batch, idx)?; - let literal = try_cast_literal(&literal, left.data_type())?; - lt_eq(&left, literal.as_ref()) - })) - } else { - // A missing column, treating it as null. - self.build_always_true() - } - } - - fn greater_than( - &mut self, - reference: &BoundReference, - literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result> { - if let Some(idx) = self.bound_reference(reference)? { - let literal = get_arrow_datum(literal)?; - - Ok(Box::new(move |batch| { - let left = project_column(&batch, idx)?; - let literal = try_cast_literal(&literal, left.data_type())?; - gt(&left, literal.as_ref()) - })) - } else { - // A missing column, treating it as null. - self.build_always_false() - } - } - - fn greater_than_or_eq( - &mut self, - reference: &BoundReference, - literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result> { - if let Some(idx) = self.bound_reference(reference)? { - let literal = get_arrow_datum(literal)?; - - Ok(Box::new(move |batch| { - let left = project_column(&batch, idx)?; - let literal = try_cast_literal(&literal, left.data_type())?; - gt_eq(&left, literal.as_ref()) - })) - } else { - // A missing column, treating it as null. - self.build_always_false() - } - } - - fn eq( - &mut self, - reference: &BoundReference, - literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result> { - if let Some(idx) = self.bound_reference(reference)? { - let literal = get_arrow_datum(literal)?; - - Ok(Box::new(move |batch| { - let left = project_column(&batch, idx)?; - let literal = try_cast_literal(&literal, left.data_type())?; - eq(&left, literal.as_ref()) - })) - } else { - // A missing column, treating it as null. - self.build_always_false() - } - } - - fn not_eq( - &mut self, - reference: &BoundReference, - literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result> { - if let Some(idx) = self.bound_reference(reference)? { - let literal = get_arrow_datum(literal)?; - - Ok(Box::new(move |batch| { - let left = project_column(&batch, idx)?; - let literal = try_cast_literal(&literal, left.data_type())?; - neq(&left, literal.as_ref()) - })) - } else { - // A missing column, treating it as null. 
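- // (`NULL != literal` evaluates to unknown, which excludes the row, hence always-false.)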
- self.build_always_false() - } - } - - fn starts_with( - &mut self, - reference: &BoundReference, - literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result<Box<PredicateResult>> { - if let Some(idx) = self.bound_reference(reference)? { - let literal = get_arrow_datum(literal)?; - - Ok(Box::new(move |batch| { - let left = project_column(&batch, idx)?; - let literal = try_cast_literal(&literal, left.data_type())?; - starts_with(&left, literal.as_ref()) - })) - } else { - // A missing column, treating it as null. - self.build_always_false() - } - } - - fn not_starts_with( - &mut self, - reference: &BoundReference, - literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result<Box<PredicateResult>> { - if let Some(idx) = self.bound_reference(reference)? { - let literal = get_arrow_datum(literal)?; - - Ok(Box::new(move |batch| { - let left = project_column(&batch, idx)?; - let literal = try_cast_literal(&literal, left.data_type())?; - // update here if arrow ever adds a native not_starts_with - not(&starts_with(&left, literal.as_ref())?) - })) - } else { - // A missing column, treating it as null. - self.build_always_true() - } - } - - fn r#in( - &mut self, - reference: &BoundReference, - literals: &FnvHashSet<Datum>, - _predicate: &BoundPredicate, - ) -> Result<Box<PredicateResult>> { - if let Some(idx) = self.bound_reference(reference)? { - let literals: Vec<_> = literals - .iter() - .map(|lit| get_arrow_datum(lit).unwrap()) - .collect(); - - Ok(Box::new(move |batch| { - // update this if arrow ever adds a native is_in kernel - let left = project_column(&batch, idx)?; - - let mut acc = BooleanArray::from(vec![false; batch.num_rows()]); - for literal in &literals { - let literal = try_cast_literal(literal, left.data_type())?; - acc = or(&acc, &eq(&left, literal.as_ref())?)? - } - - Ok(acc) - })) - } else { - // A missing column, treating it as null. - self.build_always_false() - } - } - - fn not_in( - &mut self, - reference: &BoundReference, - literals: &FnvHashSet<Datum>, - _predicate: &BoundPredicate, - ) -> Result<Box<PredicateResult>> { - if let Some(idx) = self.bound_reference(reference)? { - let literals: Vec<_> = literals - .iter() - .map(|lit| get_arrow_datum(lit).unwrap()) - .collect(); - - Ok(Box::new(move |batch| { - // update this if arrow ever adds a native not_in kernel - let left = project_column(&batch, idx)?; - let mut acc = BooleanArray::from(vec![true; batch.num_rows()]); - for literal in &literals { - let literal = try_cast_literal(literal, left.data_type())?; - acc = and(&acc, &neq(&left, literal.as_ref())?)? - } - - Ok(acc) - })) - } else { - // A missing column, treating it as null. - self.build_always_true() - } - } -} - -/// ArrowFileReader is a wrapper around a FileRead that implements parquet's AsyncFileReader. -pub struct ArrowFileReader { - meta: FileMetadata, - parquet_read_options: ParquetReadOptions, - r: Box<dyn FileRead>, -} - -impl ArrowFileReader { - /// Create a new ArrowFileReader - pub fn new(meta: FileMetadata, r: Box<dyn FileRead>) -> Self { - Self { - meta, - parquet_read_options: ParquetReadOptions::builder().build(), - r, - } - } - - /// Configure all Parquet read options. - pub(crate) fn with_parquet_read_options(mut self, options: ParquetReadOptions) -> Self { - self.parquet_read_options = options; - self - } -} - -impl AsyncFileReader for ArrowFileReader { - fn get_bytes(&mut self, range: Range<u64>) -> BoxFuture<'_, parquet::errors::Result<Bytes>> { - Box::pin( - self.r - .read(range.start..range.end) - .map_err(|err| parquet::errors::ParquetError::External(Box::new(err))), - ) - } - - /// Override the default `get_byte_ranges` which calls `get_bytes` sequentially.
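- /// (Worked example, not from the original source: with coalesce_bytes = 1 MiB, requested - /// ranges [0..100, 200..300, 5_000_000..5_000_100] merge into fetches [0..300, - /// 5_000_000..5_000_100], and each requested slice is cut back out of the merged buffers.)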
- /// The parquet reader calls this to fetch column chunks for a row group, so - /// without this override each column chunk is a serial round-trip to object storage. - /// Adapted from object_store's `coalesce_ranges` in `util.rs`. - fn get_byte_ranges( - &mut self, - ranges: Vec<Range<u64>>, - ) -> BoxFuture<'_, parquet::errors::Result<Vec<Bytes>>> { - let coalesce_bytes = self.parquet_read_options.range_coalesce_bytes(); - let concurrency = self.parquet_read_options.range_fetch_concurrency().max(1); - - async move { - // Merge nearby ranges to reduce the number of object store requests. - let fetch_ranges = merge_ranges(&ranges, coalesce_bytes); - let r = &self.r; - - // Fetch merged ranges concurrently. - let fetched: Vec<Bytes> = futures::stream::iter(fetch_ranges.iter().cloned()) - .map(|range| async move { - r.read(range) - .await - .map_err(|e| parquet::errors::ParquetError::External(Box::new(e))) - }) - .buffered(concurrency) - .try_collect() - .await?; - - // Slice the fetched data back into the originally requested ranges. - Ok(ranges - .iter() - .map(|range| { - let idx = fetch_ranges.partition_point(|v| v.start <= range.start) - 1; - let fetch_range = &fetch_ranges[idx]; - let fetch_bytes = &fetched[idx]; - let start = (range.start - fetch_range.start) as usize; - let end = (range.end - fetch_range.start) as usize; - fetch_bytes.slice(start..end.min(fetch_bytes.len())) - }) - .collect()) - } - .boxed() - } - - // TODO: currently we don't respect `ArrowReaderOptions` because it doesn't expose any method to access the option field; - // we will fix it after `v55.1.0` is released in https://github.com/apache/arrow-rs/issues/7393 - fn get_metadata( - &mut self, - _options: Option<&'_ ArrowReaderOptions>, - ) -> BoxFuture<'_, parquet::errors::Result<Arc<ParquetMetaData>>> { - async move { - let reader = ParquetMetaDataReader::new() - .with_prefetch_hint(self.parquet_read_options.metadata_size_hint()) - // Set the page policy first because it updates both column and offset policies. - .with_page_index_policy(PageIndexPolicy::from( - self.parquet_read_options.preload_page_index(), - )) - .with_column_index_policy(PageIndexPolicy::from( - self.parquet_read_options.preload_column_index(), - )) - .with_offset_index_policy(PageIndexPolicy::from( - self.parquet_read_options.preload_offset_index(), - )); - let size = self.meta.size; - let meta = reader.load_and_finish(self, size).await?; - - Ok(Arc::new(meta)) - } - .boxed() - } -} - -/// Merge overlapping or nearby byte ranges, combining ranges with gaps <= `coalesce` bytes. -/// Adapted from object_store's `merge_ranges` in `util.rs`. -fn merge_ranges(ranges: &[Range<u64>], coalesce: u64) -> Vec<Range<u64>> { - if ranges.is_empty() { - return vec![]; - } - - let mut ranges = ranges.to_vec(); - ranges.sort_unstable_by_key(|r| r.start); - - let mut merged = Vec::with_capacity(ranges.len()); - let mut start_idx = 0; - let mut end_idx = 1; - - while start_idx != ranges.len() { - let mut range_end = ranges[start_idx].end; - - while end_idx != ranges.len() - && ranges[end_idx] - .start - .checked_sub(range_end) - .map(|delta| delta <= coalesce) - .unwrap_or(true) - { - range_end = range_end.max(ranges[end_idx].end); - end_idx += 1; - } - - merged.push(ranges[start_idx].start..range_end); - start_idx = end_idx; - end_idx += 1; - } - - merged -} - -/// The Arrow type of an array that the Parquet reader reads may not match the exact Arrow type -/// that Iceberg uses for literals - but they are effectively the same logical type, -/// i.e. LargeUtf8 and Utf8 or Utf8View and Utf8 or Utf8View and LargeUtf8.
-/// -/// The Arrow compute kernels that we use must match the type exactly, so first cast the literal -/// into the type of the batch we read from Parquet before sending it to the compute kernel. -fn try_cast_literal( - literal: &Arc, - column_type: &DataType, -) -> std::result::Result, ArrowError> { - let literal_array = literal.get().0; - - // No cast required - if literal_array.data_type() == column_type { - return Ok(Arc::clone(literal)); - } - - let literal_array = cast(literal_array, column_type)?; - Ok(Arc::new(Scalar::new(literal_array))) -} - -#[cfg(test)] -mod tests { - use std::collections::{HashMap, HashSet}; - use std::fs::File; - use std::ops::Range; - use std::sync::Arc; - - use arrow_array::cast::AsArray; - use arrow_array::{ArrayRef, LargeStringArray, RecordBatch, StringArray}; - use arrow_schema::{DataType, Field, Schema as ArrowSchema, TimeUnit}; - use futures::TryStreamExt; - use parquet::arrow::arrow_reader::{RowSelection, RowSelector}; - use parquet::arrow::{ArrowWriter, ProjectionMask}; - use parquet::basic::Compression; - use parquet::file::metadata::{ColumnChunkMetaData, RowGroupMetaData}; - use parquet::file::properties::WriterProperties; - use parquet::schema::parser::parse_message_type; - use parquet::schema::types::{SchemaDescPtr, SchemaDescriptor}; - use roaring::RoaringTreemap; - use tempfile::TempDir; - - use crate::ErrorKind; - use crate::arrow::reader::{CollectFieldIdVisitor, PARQUET_FIELD_ID_META_KEY}; - use crate::arrow::{ArrowReader, ArrowReaderBuilder}; - use crate::delete_vector::DeleteVector; - use crate::expr::visitors::bound_predicate_visitor::visit; - use crate::expr::{Bind, Predicate, Reference}; - use crate::io::FileIO; - use crate::scan::{FileScanTask, FileScanTaskDeleteFile, FileScanTaskStream}; - use crate::spec::{ - DataContentType, DataFileFormat, Datum, NestedField, PrimitiveType, Schema, SchemaRef, Type, - }; - - fn table_schema_simple() -> SchemaRef { - Arc::new( - Schema::builder() - .with_schema_id(1) - .with_identifier_field_ids(vec![2]) - .with_fields(vec![ - NestedField::optional(1, "foo", Type::Primitive(PrimitiveType::String)).into(), - NestedField::required(2, "bar", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::optional(3, "baz", Type::Primitive(PrimitiveType::Boolean)).into(), - NestedField::optional(4, "qux", Type::Primitive(PrimitiveType::Float)).into(), - ]) - .build() - .unwrap(), - ) - } - - #[test] - fn test_collect_field_id() { - let schema = table_schema_simple(); - let expr = Reference::new("qux").is_null(); - let bound_expr = expr.bind(schema, true).unwrap(); - - let mut visitor = CollectFieldIdVisitor { - field_ids: HashSet::default(), - }; - visit(&mut visitor, &bound_expr).unwrap(); - - let mut expected = HashSet::default(); - expected.insert(4_i32); - - assert_eq!(visitor.field_ids, expected); - } - - #[test] - fn test_collect_field_id_with_and() { - let schema = table_schema_simple(); - let expr = Reference::new("qux") - .is_null() - .and(Reference::new("baz").is_null()); - let bound_expr = expr.bind(schema, true).unwrap(); - - let mut visitor = CollectFieldIdVisitor { - field_ids: HashSet::default(), - }; - visit(&mut visitor, &bound_expr).unwrap(); - - let mut expected = HashSet::default(); - expected.insert(4_i32); - expected.insert(3); - - assert_eq!(visitor.field_ids, expected); - } - - #[test] - fn test_collect_field_id_with_or() { - let schema = table_schema_simple(); - let expr = Reference::new("qux") - .is_null() - .or(Reference::new("baz").is_null()); - let bound_expr = expr.bind(schema, 
true).unwrap(); - - let mut visitor = CollectFieldIdVisitor { - field_ids: HashSet::default(), - }; - visit(&mut visitor, &bound_expr).unwrap(); - - let mut expected = HashSet::default(); - expected.insert(4_i32); - expected.insert(3); - - assert_eq!(visitor.field_ids, expected); - } - - #[test] - fn test_arrow_projection_mask() { - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_identifier_field_ids(vec![1]) - .with_fields(vec![ - NestedField::required(1, "c1", Type::Primitive(PrimitiveType::String)).into(), - NestedField::optional(2, "c2", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::optional( - 3, - "c3", - Type::Primitive(PrimitiveType::Decimal { - precision: 38, - scale: 3, - }), - ) - .into(), - ]) - .build() - .unwrap(), - ); - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("c1", DataType::Utf8, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "1".to_string(), - )])), - // Type not supported - Field::new("c2", DataType::Duration(TimeUnit::Microsecond), true).with_metadata( - HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(), "2".to_string())]), - ), - // Precision is beyond the supported range - Field::new("c3", DataType::Decimal128(39, 3), true).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "3".to_string(), - )])), - ])); - - let message_type = " -message schema { - required binary c1 (STRING) = 1; - optional int32 c2 (INTEGER(8,true)) = 2; - optional fixed_len_byte_array(17) c3 (DECIMAL(39,3)) = 3; -} - "; - let parquet_type = parse_message_type(message_type).expect("should parse schema"); - let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_type)); - - // Try projecting the fields c2 and c3 with the unsupported data types - let err = ArrowReader::get_arrow_projection_mask( - &[1, 2, 3], - &schema, - &parquet_schema, - &arrow_schema, - false, - ) - .unwrap_err(); - - assert_eq!(err.kind(), ErrorKind::DataInvalid); - assert_eq!( - err.to_string(), - "DataInvalid => Unsupported Arrow data type: Duration(µs)".to_string() - ); - - // Omitting field c2, we still get an error due to c3 being selected - let err = ArrowReader::get_arrow_projection_mask( - &[1, 3], - &schema, - &parquet_schema, - &arrow_schema, - false, - ) - .unwrap_err(); - - assert_eq!(err.kind(), ErrorKind::DataInvalid); - assert_eq!( - err.to_string(), - "DataInvalid => Failed to create decimal type, source: DataInvalid => Decimals with precision larger than 38 are not supported: 39".to_string() - ); - - // Finally avoid selecting fields with unsupported data types - let mask = ArrowReader::get_arrow_projection_mask( - &[1], - &schema, - &parquet_schema, - &arrow_schema, - false, - ) - .expect("Some ProjectionMask"); - assert_eq!(mask, ProjectionMask::leaves(&parquet_schema, vec![0])); - } - - #[tokio::test] - async fn test_kleene_logic_or_behaviour() { - // a IS NULL OR a = 'foo' - let predicate = Reference::new("a") - .is_null() - .or(Reference::new("a").equal_to(Datum::string("foo"))); - - // Table data: [NULL, "foo", "bar"] - let data_for_col_a = vec![None, Some("foo".to_string()), Some("bar".to_string())]; - - // Expected: [NULL, "foo"]. 
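- // ("bar" is dropped: `bar IS NULL` is false and `bar = 'foo'` is false, so the OR is - // false; the NULL row survives because `NULL IS NULL` is true.)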
- let expected = vec![None, Some("foo".to_string())]; - - let (file_io, schema, table_location, _temp_dir) = - setup_kleene_logic(data_for_col_a, DataType::Utf8); - let reader = ArrowReaderBuilder::new(file_io).build(); - - let result_data = test_perform_read(predicate, schema, table_location, reader).await; - - assert_eq!(result_data, expected); - } - - #[tokio::test] - async fn test_kleene_logic_and_behaviour() { - // a IS NOT NULL AND a != 'foo' - let predicate = Reference::new("a") - .is_not_null() - .and(Reference::new("a").not_equal_to(Datum::string("foo"))); - - // Table data: [NULL, "foo", "bar"] - let data_for_col_a = vec![None, Some("foo".to_string()), Some("bar".to_string())]; - - // Expected: ["bar"]. - let expected = vec![Some("bar".to_string())]; - - let (file_io, schema, table_location, _temp_dir) = - setup_kleene_logic(data_for_col_a, DataType::Utf8); - let reader = ArrowReaderBuilder::new(file_io).build(); - - let result_data = test_perform_read(predicate, schema, table_location, reader).await; - - assert_eq!(result_data, expected); - } - - #[tokio::test] - async fn test_predicate_cast_literal() { - let predicates = vec![ - // a == 'foo' - (Reference::new("a").equal_to(Datum::string("foo")), vec![ - Some("foo".to_string()), - ]), - // a != 'foo' - ( - Reference::new("a").not_equal_to(Datum::string("foo")), - vec![Some("bar".to_string())], - ), - // STARTS_WITH(a, 'f') - (Reference::new("a").starts_with(Datum::string("f")), vec![ - Some("foo".to_string()), - ]), - // NOT STARTS_WITH(a, 'f') - ( - Reference::new("a").not_starts_with(Datum::string("f")), - vec![Some("bar".to_string())], - ), - // a < 'foo' - (Reference::new("a").less_than(Datum::string("foo")), vec![ - Some("bar".to_string()), - ]), - // a <= 'foo' - ( - Reference::new("a").less_than_or_equal_to(Datum::string("foo")), - vec![Some("foo".to_string()), Some("bar".to_string())], - ), - // a > 'bar' - ( - Reference::new("a").greater_than(Datum::string("bar")), - vec![Some("foo".to_string())], - ), - // a >= 'foo' - ( - Reference::new("a").greater_than_or_equal_to(Datum::string("foo")), - vec![Some("foo".to_string())], - ), - // a IN ('foo', 'baz') - ( - Reference::new("a").is_in([Datum::string("foo"), Datum::string("baz")]), - vec![Some("foo".to_string())], - ), - // a NOT IN ('foo', 'baz') - ( - Reference::new("a").is_not_in([Datum::string("foo"), Datum::string("baz")]), - vec![Some("bar".to_string())], - ), - ]; - - // Table data: ["foo", "bar"] - let data_for_col_a = vec![Some("foo".to_string()), Some("bar".to_string())]; - - let (file_io, schema, table_location, _temp_dir) = - setup_kleene_logic(data_for_col_a, DataType::LargeUtf8); - let reader = ArrowReaderBuilder::new(file_io).build(); - - for (predicate, expected) in predicates { - println!("testing predicate {predicate}"); - let result_data = test_perform_read( - predicate.clone(), - schema.clone(), - table_location.clone(), - reader.clone(), - ) - .await; - - assert_eq!(result_data, expected, "predicate={predicate}"); - } - } - - async fn test_perform_read( - predicate: Predicate, - schema: SchemaRef, - table_location: String, - reader: ArrowReader, - ) -> Vec<Option<String>> { - let tasks = Box::pin(futures::stream::iter( - vec![Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/1.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1], - predicate:
Some(predicate.bind(schema, true).unwrap()), - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - })] - .into_iter(), - )) as FileScanTaskStream; - - let result = reader - .read(tasks) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - result[0].columns()[0] - .as_string_opt::() - .unwrap() - .iter() - .map(|v| v.map(ToOwned::to_owned)) - .collect::>() - } - - fn setup_kleene_logic( - data_for_col_a: Vec>, - col_a_type: DataType, - ) -> (FileIO, SchemaRef, String, TempDir) { - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::optional(1, "a", Type::Primitive(PrimitiveType::String)).into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("a", col_a_type.clone(), true).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "1".to_string(), - )])), - ])); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - - let file_io = FileIO::new_with_fs(); - - let col = match col_a_type { - DataType::Utf8 => Arc::new(StringArray::from(data_for_col_a)) as ArrayRef, - DataType::LargeUtf8 => Arc::new(LargeStringArray::from(data_for_col_a)) as ArrayRef, - _ => panic!("unexpected col_a_type"), - }; - - let to_write = RecordBatch::try_new(arrow_schema.clone(), vec![col]).unwrap(); - - // Write the Parquet files - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - - let file = File::create(format!("{table_location}/1.parquet")).unwrap(); - let mut writer = - ArrowWriter::try_new(file, to_write.schema(), Some(props.clone())).unwrap(); - - writer.write(&to_write).expect("Writing batch"); - - // writer must be closed to write footer - writer.close().unwrap(); - - (file_io, schema, table_location, tmp_dir) - } - - #[test] - fn test_build_deletes_row_selection() { - let schema_descr = get_test_schema_descr(); - - let mut columns = vec![]; - for ptr in schema_descr.columns() { - let column = ColumnChunkMetaData::builder(ptr.clone()).build().unwrap(); - columns.push(column); - } - - let row_groups_metadata = vec![ - build_test_row_group_meta(schema_descr.clone(), columns.clone(), 1000, 0), - build_test_row_group_meta(schema_descr.clone(), columns.clone(), 500, 1), - build_test_row_group_meta(schema_descr.clone(), columns.clone(), 500, 2), - build_test_row_group_meta(schema_descr.clone(), columns.clone(), 1000, 3), - build_test_row_group_meta(schema_descr.clone(), columns.clone(), 500, 4), - ]; - - let selected_row_groups = Some(vec![1, 3]); - - /* cases to cover: - * {skip|select} {first|intermediate|last} {one row|multiple rows} in - {first|intermediate|last} {skipped|selected} row group - * row group selection disabled - */ - - let positional_deletes = RoaringTreemap::from_iter(&[ - 1, // in skipped rg 0, should be ignored - 3, // run of three consecutive items in skipped rg0 - 4, 5, 998, // two consecutive items at end of skipped rg0 - 999, 1000, // solitary row at start of selected rg1 (1, 9) - 1010, // run of 3 rows in selected rg1 - 1011, 1012, // (3, 485) - 1498, // run of two items at end of selected rg1 - 1499, 1500, // run of two items at start of skipped rg2 - 1501, 1600, // should ignore, in skipped rg2 - 1999, // single row at end of skipped rg2 - 2000, // run of two items at start of selected rg3 - 2001, // (4, 98) - 2100, // single row in selected row group 3 (1, 99) - 2200, // run of 3 consecutive rows in selected 
row group 3 - 2201, 2202, // (3, 796) - 2999, // single item at end of selected rg3 (1) - 3000, // single item at start of skipped rg4 - ]); - - let positional_deletes = DeleteVector::new(positional_deletes); - - // using selected row groups 1 and 3 - let result = ArrowReader::build_deletes_row_selection( - &row_groups_metadata, - &selected_row_groups, - &positional_deletes, - ) - .unwrap(); - - let expected = RowSelection::from(vec![ - RowSelector::skip(1), - RowSelector::select(9), - RowSelector::skip(3), - RowSelector::select(485), - RowSelector::skip(4), - RowSelector::select(98), - RowSelector::skip(1), - RowSelector::select(99), - RowSelector::skip(3), - RowSelector::select(796), - RowSelector::skip(1), - ]); - - assert_eq!(result, expected); - - // selecting all row groups - let result = ArrowReader::build_deletes_row_selection( - &row_groups_metadata, - &None, - &positional_deletes, - ) - .unwrap(); - - let expected = RowSelection::from(vec![ - RowSelector::select(1), - RowSelector::skip(1), - RowSelector::select(1), - RowSelector::skip(3), - RowSelector::select(992), - RowSelector::skip(3), - RowSelector::select(9), - RowSelector::skip(3), - RowSelector::select(485), - RowSelector::skip(4), - RowSelector::select(98), - RowSelector::skip(1), - RowSelector::select(398), - RowSelector::skip(3), - RowSelector::select(98), - RowSelector::skip(1), - RowSelector::select(99), - RowSelector::skip(3), - RowSelector::select(796), - RowSelector::skip(2), - RowSelector::select(499), - ]); - - assert_eq!(result, expected); - } - - fn build_test_row_group_meta( - schema_descr: SchemaDescPtr, - columns: Vec, - num_rows: i64, - ordinal: i16, - ) -> RowGroupMetaData { - RowGroupMetaData::builder(schema_descr.clone()) - .set_num_rows(num_rows) - .set_total_byte_size(2000) - .set_column_metadata(columns) - .set_ordinal(ordinal) - .build() - .unwrap() - } - - fn get_test_schema_descr() -> SchemaDescPtr { - use parquet::schema::types::Type as SchemaType; - - let schema = SchemaType::group_type_builder("schema") - .with_fields(vec![ - Arc::new( - SchemaType::primitive_type_builder("a", parquet::basic::Type::INT32) - .build() - .unwrap(), - ), - Arc::new( - SchemaType::primitive_type_builder("b", parquet::basic::Type::INT32) - .build() - .unwrap(), - ), - ]) - .build() - .unwrap(); - - Arc::new(SchemaDescriptor::new(Arc::new(schema))) - } - - /// Verifies that file splits respect byte ranges and only read specific row groups. - #[tokio::test] - async fn test_file_splits_respect_byte_ranges() { - use arrow_array::Int32Array; - use parquet::file::reader::{FileReader, SerializedFileReader}; - - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "1".to_string(), - )])), - ])); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_path = format!("{table_location}/multi_row_group.parquet"); - - // Force each batch into its own row group for testing byte range filtering. 
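- // (Three 100-row batches with the writer capped at 100 rows per row group yield three - // row groups at predictable byte offsets.)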
- let batch1 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new(Int32Array::from( - (0..100).collect::>(), - ))]) - .unwrap(); - let batch2 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new(Int32Array::from( - (100..200).collect::>(), - ))]) - .unwrap(); - let batch3 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new(Int32Array::from( - (200..300).collect::>(), - ))]) - .unwrap(); - - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .set_max_row_group_row_count(Some(100)) - .build(); - - let file = File::create(&file_path).unwrap(); - let mut writer = ArrowWriter::try_new(file, arrow_schema.clone(), Some(props)).unwrap(); - writer.write(&batch1).expect("Writing batch 1"); - writer.write(&batch2).expect("Writing batch 2"); - writer.write(&batch3).expect("Writing batch 3"); - writer.close().unwrap(); - - // Read the file metadata to get row group byte positions - let file = File::open(&file_path).unwrap(); - let reader = SerializedFileReader::new(file).unwrap(); - let metadata = reader.metadata(); - - println!("File has {} row groups", metadata.num_row_groups()); - assert_eq!(metadata.num_row_groups(), 3, "Expected 3 row groups"); - - // Get byte positions for each row group - let row_group_0 = metadata.row_group(0); - let row_group_1 = metadata.row_group(1); - let row_group_2 = metadata.row_group(2); - - let rg0_start = 4u64; // Parquet files start with 4-byte magic "PAR1" - let rg1_start = rg0_start + row_group_0.compressed_size() as u64; - let rg2_start = rg1_start + row_group_1.compressed_size() as u64; - let file_end = rg2_start + row_group_2.compressed_size() as u64; - - println!( - "Row group 0: {} rows, starts at byte {}, {} bytes compressed", - row_group_0.num_rows(), - rg0_start, - row_group_0.compressed_size() - ); - println!( - "Row group 1: {} rows, starts at byte {}, {} bytes compressed", - row_group_1.num_rows(), - rg1_start, - row_group_1.compressed_size() - ); - println!( - "Row group 2: {} rows, starts at byte {}, {} bytes compressed", - row_group_2.num_rows(), - rg2_start, - row_group_2.compressed_size() - ); - - let file_io = FileIO::new_with_fs(); - let reader = ArrowReaderBuilder::new(file_io).build(); - - // Task 1: read only the first row group - let task1 = FileScanTask { - file_size_in_bytes: std::fs::metadata(&file_path).unwrap().len(), - start: rg0_start, - length: row_group_0.compressed_size() as u64, - record_count: Some(100), - data_file_path: file_path.clone(), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - }; - - // Task 2: read the second and third row groups - let task2 = FileScanTask { - file_size_in_bytes: std::fs::metadata(&file_path).unwrap().len(), - start: rg1_start, - length: file_end - rg1_start, - record_count: Some(200), - data_file_path: file_path.clone(), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - }; - - let tasks1 = Box::pin(futures::stream::iter(vec![Ok(task1)])) as FileScanTaskStream; - let result1 = reader - .clone() - .read(tasks1) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - let total_rows_task1: usize = result1.iter().map(|b| b.num_rows()).sum(); - println!( - "Task 1 (bytes {}-{}) returned {} rows", - 
rg0_start, - rg0_start + row_group_0.compressed_size() as u64, - total_rows_task1 - ); - - let tasks2 = Box::pin(futures::stream::iter(vec![Ok(task2)])) as FileScanTaskStream; - let result2 = reader - .read(tasks2) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - let total_rows_task2: usize = result2.iter().map(|b| b.num_rows()).sum(); - println!("Task 2 (bytes {rg1_start}-{file_end}) returned {total_rows_task2} rows"); - - assert_eq!( - total_rows_task1, 100, - "Task 1 should read only the first row group (100 rows), but got {total_rows_task1} rows" - ); - - assert_eq!( - total_rows_task2, 200, - "Task 2 should read only the second+third row groups (200 rows), but got {total_rows_task2} rows" - ); - - // Verify the actual data values are correct (not just the row count) - if total_rows_task1 > 0 { - let first_batch = &result1[0]; - let id_col = first_batch - .column(0) - .as_primitive::(); - let first_val = id_col.value(0); - let last_val = id_col.value(id_col.len() - 1); - println!("Task 1 data range: {first_val} to {last_val}"); - - assert_eq!(first_val, 0, "Task 1 should start with id=0"); - assert_eq!(last_val, 99, "Task 1 should end with id=99"); - } - - if total_rows_task2 > 0 { - let first_batch = &result2[0]; - let id_col = first_batch - .column(0) - .as_primitive::(); - let first_val = id_col.value(0); - println!("Task 2 first value: {first_val}"); - - assert_eq!(first_val, 100, "Task 2 should start with id=100, not id=0"); - } - } - - /// Test schema evolution: reading old Parquet file (with only column 'a') - /// using a newer table schema (with columns 'a' and 'b'). - /// This tests that: - /// 1. get_arrow_projection_mask allows missing columns - /// 2. RecordBatchTransformer adds missing column 'b' with NULL values - #[tokio::test] - async fn test_schema_evolution_add_column() { - use arrow_array::{Array, Int32Array}; - - // New table schema: columns 'a' and 'b' (b was added later, file only has 'a') - let new_schema = Arc::new( - Schema::builder() - .with_schema_id(2) - .with_fields(vec![ - NestedField::required(1, "a", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::optional(2, "b", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - // Create Arrow schema for old Parquet file (only has column 'a') - let arrow_schema_old = Arc::new(ArrowSchema::new(vec![ - Field::new("a", DataType::Int32, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "1".to_string(), - )])), - ])); - - // Write old Parquet file with only column 'a' - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::new_with_fs(); - - let data_a = Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef; - let to_write = RecordBatch::try_new(arrow_schema_old.clone(), vec![data_a]).unwrap(); - - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - let file = File::create(format!("{table_location}/old_file.parquet")).unwrap(); - let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); - writer.write(&to_write).expect("Writing batch"); - writer.close().unwrap(); - - // Read the old Parquet file using the NEW schema (with column 'b') - let reader = ArrowReaderBuilder::new(file_io).build(); - let tasks = Box::pin(futures::stream::iter( - vec![Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/old_file.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - 
record_count: None, - data_file_path: format!("{table_location}/old_file.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: new_schema.clone(), - project_field_ids: vec![1, 2], // Request both columns 'a' and 'b' - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - })] - .into_iter(), - )) as FileScanTaskStream; - - let result = reader - .read(tasks) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - // Verify we got the correct data - assert_eq!(result.len(), 1); - let batch = &result[0]; - - // Should have 2 columns now - assert_eq!(batch.num_columns(), 2); - assert_eq!(batch.num_rows(), 3); - - // Column 'a' should have the original data - let col_a = batch - .column(0) - .as_primitive::(); - assert_eq!(col_a.values(), &[1, 2, 3]); - - // Column 'b' should be all NULLs (it didn't exist in the old file) - let col_b = batch - .column(1) - .as_primitive::(); - assert_eq!(col_b.null_count(), 3); - assert!(col_b.is_null(0)); - assert!(col_b.is_null(1)); - assert!(col_b.is_null(2)); - } - - /// Test for bug where position deletes in later row groups are not applied correctly. - /// - /// When a file has multiple row groups and a position delete targets a row in a later - /// row group, the `build_deletes_row_selection` function had a bug where it would - /// fail to increment `current_row_group_base_idx` when skipping row groups. - /// - /// This test creates: - /// - A data file with 200 rows split into 2 row groups (0-99, 100-199) - /// - A position delete file that deletes row 199 (last row in second row group) - /// - /// Expected behavior: Should return 199 rows (with id=200 deleted) - /// Bug behavior: Returns 200 rows (delete is not applied) - /// - /// This bug was discovered while running Apache Spark + Apache Iceberg integration tests - /// through DataFusion Comet. 
The following Iceberg Java tests failed due to this bug: - /// - `org.apache.iceberg.spark.extensions.TestMergeOnReadDelete::testDeleteWithMultipleRowGroupsParquet` - /// - `org.apache.iceberg.spark.extensions.TestMergeOnReadUpdate::testUpdateWithMultipleRowGroupsParquet` - #[tokio::test] - async fn test_position_delete_across_multiple_row_groups() { - use arrow_array::{Int32Array, Int64Array}; - use parquet::file::reader::{FileReader, SerializedFileReader}; - - // Field IDs for positional delete schema - const FIELD_ID_POSITIONAL_DELETE_FILE_PATH: u64 = 2147483546; - const FIELD_ID_POSITIONAL_DELETE_POS: u64 = 2147483545; - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - - // Create table schema with a single 'id' column - let table_schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "1".to_string(), - )])), - ])); - - // Step 1: Create data file with 200 rows in 2 row groups - // Row group 0: rows 0-99 (ids 1-100) - // Row group 1: rows 100-199 (ids 101-200) - let data_file_path = format!("{table_location}/data.parquet"); - - let batch1 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new( - Int32Array::from_iter_values(1..=100), - )]) - .unwrap(); - - let batch2 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new( - Int32Array::from_iter_values(101..=200), - )]) - .unwrap(); - - // Force each batch into its own row group - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .set_max_row_group_row_count(Some(100)) - .build(); - - let file = File::create(&data_file_path).unwrap(); - let mut writer = ArrowWriter::try_new(file, arrow_schema.clone(), Some(props)).unwrap(); - writer.write(&batch1).expect("Writing batch 1"); - writer.write(&batch2).expect("Writing batch 2"); - writer.close().unwrap(); - - // Verify we created 2 row groups - let verify_file = File::open(&data_file_path).unwrap(); - let verify_reader = SerializedFileReader::new(verify_file).unwrap(); - assert_eq!( - verify_reader.metadata().num_row_groups(), - 2, - "Should have 2 row groups" - ); - - // Step 2: Create position delete file that deletes row 199 (id=200, last row in row group 1) - let delete_file_path = format!("{table_location}/deletes.parquet"); - - let delete_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("file_path", DataType::Utf8, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - FIELD_ID_POSITIONAL_DELETE_FILE_PATH.to_string(), - )])), - Field::new("pos", DataType::Int64, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - FIELD_ID_POSITIONAL_DELETE_POS.to_string(), - )])), - ])); - - // Delete row at position 199 (0-indexed, so it's the last row: id=200) - let delete_batch = RecordBatch::try_new(delete_schema.clone(), vec![ - Arc::new(StringArray::from_iter_values(vec![data_file_path.clone()])), - Arc::new(Int64Array::from_iter_values(vec![199i64])), - ]) - .unwrap(); - - let delete_props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - - let delete_file = File::create(&delete_file_path).unwrap(); - let mut delete_writer = - ArrowWriter::try_new(delete_file, delete_schema, 
Some(delete_props)).unwrap(); - delete_writer.write(&delete_batch).unwrap(); - delete_writer.close().unwrap(); - - // Step 3: Read the data file with the delete applied - let file_io = FileIO::new_with_fs(); - let reader = ArrowReaderBuilder::new(file_io).build(); - - let task = FileScanTask { - file_size_in_bytes: std::fs::metadata(&data_file_path).unwrap().len(), - start: 0, - length: 0, - record_count: Some(200), - data_file_path: data_file_path.clone(), - data_file_format: DataFileFormat::Parquet, - schema: table_schema.clone(), - project_field_ids: vec![1], - predicate: None, - deletes: vec![FileScanTaskDeleteFile { - file_size_in_bytes: std::fs::metadata(&delete_file_path).unwrap().len(), - file_path: delete_file_path, - file_type: DataContentType::PositionDeletes, - partition_spec_id: 0, - equality_ids: None, - }], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - }; - - let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream; - let result = reader - .read(tasks) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - // Step 4: Verify we got 199 rows (not 200) - let total_rows: usize = result.iter().map(|b| b.num_rows()).sum(); - - println!("Total rows read: {total_rows}"); - println!("Expected: 199 rows (deleted row 199 which had id=200)"); - - // This assertion will FAIL before the fix and PASS after the fix - assert_eq!( - total_rows, 199, - "Expected 199 rows after deleting row 199, but got {total_rows} rows. \ - The bug causes position deletes in later row groups to be ignored." - ); - - // Verify the deleted row (id=200) is not present - let all_ids: Vec = result - .iter() - .flat_map(|batch| { - batch - .column(0) - .as_primitive::() - .values() - .iter() - .copied() - }) - .collect(); - - assert!( - !all_ids.contains(&200), - "Row with id=200 should be deleted but was found in results" - ); - - // Verify we have all other ids (1-199) - let expected_ids: Vec = (1..=199).collect(); - assert_eq!( - all_ids, expected_ids, - "Should have ids 1-199 but got different values" - ); - } - - /// Test for bug where position deletes are lost when skipping unselected row groups. - /// - /// This is a variant of `test_position_delete_across_multiple_row_groups` that exercises - /// the row group selection code path (`selected_row_groups: Some([...])`). - /// - /// When a file has multiple row groups and only some are selected for reading, - /// the `build_deletes_row_selection` function must correctly skip over deletes in - /// unselected row groups WITHOUT consuming deletes that belong to selected row groups. - /// - /// This test creates: - /// - A data file with 200 rows split into 2 row groups (0-99, 100-199) - /// - A position delete file that deletes row 199 (last row in second row group) - /// - Row group selection that reads ONLY row group 1 (rows 100-199) - /// - /// Expected behavior: Should return 99 rows (with row 199 deleted) - /// Bug behavior: Returns 100 rows (delete is lost when skipping row group 0) - /// - /// The bug occurs when processing row group 0 (unselected): - /// ```rust - /// delete_vector_iter.advance_to(next_row_group_base_idx); // Position at first delete >= 100 - /// next_deleted_row_idx_opt = delete_vector_iter.next(); // BUG: Consumes delete at 199! - /// ``` - /// - /// The fix is to NOT call `next()` after `advance_to()` when skipping unselected row groups, - /// because `advance_to()` already positions the iterator correctly without consuming elements. 
- #[tokio::test] - async fn test_position_delete_with_row_group_selection() { - use arrow_array::{Int32Array, Int64Array}; - use parquet::file::reader::{FileReader, SerializedFileReader}; - - // Field IDs for positional delete schema - const FIELD_ID_POSITIONAL_DELETE_FILE_PATH: u64 = 2147483546; - const FIELD_ID_POSITIONAL_DELETE_POS: u64 = 2147483545; - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - - // Create table schema with a single 'id' column - let table_schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "1".to_string(), - )])), - ])); - - // Step 1: Create data file with 200 rows in 2 row groups - // Row group 0: rows 0-99 (ids 1-100) - // Row group 1: rows 100-199 (ids 101-200) - let data_file_path = format!("{table_location}/data.parquet"); - - let batch1 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new( - Int32Array::from_iter_values(1..=100), - )]) - .unwrap(); - - let batch2 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new( - Int32Array::from_iter_values(101..=200), - )]) - .unwrap(); - - // Force each batch into its own row group - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .set_max_row_group_row_count(Some(100)) - .build(); - - let file = File::create(&data_file_path).unwrap(); - let mut writer = ArrowWriter::try_new(file, arrow_schema.clone(), Some(props)).unwrap(); - writer.write(&batch1).expect("Writing batch 1"); - writer.write(&batch2).expect("Writing batch 2"); - writer.close().unwrap(); - - // Verify we created 2 row groups - let verify_file = File::open(&data_file_path).unwrap(); - let verify_reader = SerializedFileReader::new(verify_file).unwrap(); - assert_eq!( - verify_reader.metadata().num_row_groups(), - 2, - "Should have 2 row groups" - ); - - // Step 2: Create position delete file that deletes row 199 (id=200, last row in row group 1) - let delete_file_path = format!("{table_location}/deletes.parquet"); - - let delete_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("file_path", DataType::Utf8, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - FIELD_ID_POSITIONAL_DELETE_FILE_PATH.to_string(), - )])), - Field::new("pos", DataType::Int64, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - FIELD_ID_POSITIONAL_DELETE_POS.to_string(), - )])), - ])); - - // Delete row at position 199 (0-indexed, so it's the last row: id=200) - let delete_batch = RecordBatch::try_new(delete_schema.clone(), vec![ - Arc::new(StringArray::from_iter_values(vec![data_file_path.clone()])), - Arc::new(Int64Array::from_iter_values(vec![199i64])), - ]) - .unwrap(); - - let delete_props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - - let delete_file = File::create(&delete_file_path).unwrap(); - let mut delete_writer = - ArrowWriter::try_new(delete_file, delete_schema, Some(delete_props)).unwrap(); - delete_writer.write(&delete_batch).unwrap(); - delete_writer.close().unwrap(); - - // Step 3: Get byte ranges to read ONLY row group 1 (rows 100-199) - // This exercises the row group selection code path where row group 0 is skipped - let metadata_file = 
File::open(&data_file_path).unwrap(); - let metadata_reader = SerializedFileReader::new(metadata_file).unwrap(); - let metadata = metadata_reader.metadata(); - - let row_group_0 = metadata.row_group(0); - let row_group_1 = metadata.row_group(1); - - let rg0_start = 4u64; // Parquet files start with 4-byte magic "PAR1" - let rg1_start = rg0_start + row_group_0.compressed_size() as u64; - let rg1_length = row_group_1.compressed_size() as u64; - - println!( - "Row group 0: starts at byte {}, {} bytes compressed", - rg0_start, - row_group_0.compressed_size() - ); - println!( - "Row group 1: starts at byte {}, {} bytes compressed", - rg1_start, - row_group_1.compressed_size() - ); - - let file_io = FileIO::new_with_fs(); - let reader = ArrowReaderBuilder::new(file_io).build(); - - // Create FileScanTask that reads ONLY row group 1 via byte range filtering - let task = FileScanTask { - file_size_in_bytes: std::fs::metadata(&data_file_path).unwrap().len(), - start: rg1_start, - length: rg1_length, - record_count: Some(100), // Row group 1 has 100 rows - data_file_path: data_file_path.clone(), - data_file_format: DataFileFormat::Parquet, - schema: table_schema.clone(), - project_field_ids: vec![1], - predicate: None, - deletes: vec![FileScanTaskDeleteFile { - file_size_in_bytes: std::fs::metadata(&delete_file_path).unwrap().len(), - file_path: delete_file_path, - file_type: DataContentType::PositionDeletes, - partition_spec_id: 0, - equality_ids: None, - }], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - }; - - let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream; - let result = reader - .read(tasks) - .unwrap() - .try_collect::<Vec<RecordBatch>>() - .await - .unwrap(); - - // Step 4: Verify we got 99 rows (not 100) - // Row group 1 has 100 rows (ids 101-200), minus 1 delete (id=200) = 99 rows - let total_rows: usize = result.iter().map(|b| b.num_rows()).sum(); - - println!("Total rows read from row group 1: {total_rows}"); - println!("Expected: 99 rows (row group 1 has 100 rows, 1 delete at position 199)"); - - // This assertion will FAIL before the fix and PASS after the fix - assert_eq!( - total_rows, 99, - "Expected 99 rows from row group 1 after deleting position 199, but got {total_rows} rows. \ - The bug causes position deletes to be lost when advance_to() is followed by next() \ - when skipping unselected row groups." - ); - - // Verify the deleted row (id=200) is not present - let all_ids: Vec<i32> = result - .iter() - .flat_map(|batch| { - batch - .column(0) - .as_primitive::<Int32Type>() - .values() - .iter() - .copied() - }) - .collect(); - - assert!( - !all_ids.contains(&200), - "Row with id=200 should be deleted but was found in results" - ); - - // Verify we have ids 101-199 (not 101-200) - let expected_ids: Vec<i32> = (101..=199).collect(); - assert_eq!( - all_ids, expected_ids, - "Should have ids 101-199 but got different values" - ); - } - /// Test for bug where stale cached delete causes infinite loop when skipping row groups.
- /// - /// This test exposes the inverse scenario of `test_position_delete_with_row_group_selection`: - /// - Position delete targets a row in the SKIPPED row group (not the selected one) - /// - After calling advance_to(), the cached delete index is stale - /// - Without updating the cache, the code enters an infinite loop - /// - /// This test creates: - /// - A data file with 200 rows split into 2 row groups (0-99, 100-199) - /// - A position delete file that deletes row 0 (first row in SKIPPED row group 0) - /// - Row group selection that reads ONLY row group 1 (rows 100-199) - /// - /// The bug occurs when skipping row group 0: - /// ```rust - /// let mut next_deleted_row_idx_opt = delete_vector_iter.next(); // Some(0) - /// // ... skip to row group 1 ... - /// delete_vector_iter.advance_to(100); // Iterator advances past delete at 0 - /// // BUG: next_deleted_row_idx_opt is still Some(0) - STALE! - /// // When processing row group 1: - /// // current_idx = 100, next_deleted_row_idx = 0, next_row_group_base_idx = 200 - /// // Loop condition: 0 < 200 (true) - /// // But: current_idx (100) > next_deleted_row_idx (0) - /// // And: current_idx (100) != next_deleted_row_idx (0) - /// // Neither branch executes -> INFINITE LOOP! - /// ``` - /// - /// Expected behavior: Should return 100 rows (delete at 0 doesn't affect row group 1) - /// Bug behavior: Infinite loop in build_deletes_row_selection - #[tokio::test] - async fn test_position_delete_in_skipped_row_group() { - use arrow_array::{Int32Array, Int64Array}; - use parquet::file::reader::{FileReader, SerializedFileReader}; - - // Field IDs for positional delete schema - const FIELD_ID_POSITIONAL_DELETE_FILE_PATH: u64 = 2147483546; - const FIELD_ID_POSITIONAL_DELETE_POS: u64 = 2147483545; - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - - // Create table schema with a single 'id' column - let table_schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "1".to_string(), - )])), - ])); - - // Step 1: Create data file with 200 rows in 2 row groups - // Row group 0: rows 0-99 (ids 1-100) - // Row group 1: rows 100-199 (ids 101-200) - let data_file_path = format!("{table_location}/data.parquet"); - - let batch1 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new( - Int32Array::from_iter_values(1..=100), - )]) - .unwrap(); - - let batch2 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new( - Int32Array::from_iter_values(101..=200), - )]) - .unwrap(); - - // Force each batch into its own row group - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .set_max_row_group_row_count(Some(100)) - .build(); - - let file = File::create(&data_file_path).unwrap(); - let mut writer = ArrowWriter::try_new(file, arrow_schema.clone(), Some(props)).unwrap(); - writer.write(&batch1).expect("Writing batch 1"); - writer.write(&batch2).expect("Writing batch 2"); - writer.close().unwrap(); - - // Verify we created 2 row groups - let verify_file = File::open(&data_file_path).unwrap(); - let verify_reader = SerializedFileReader::new(verify_file).unwrap(); - assert_eq!( - verify_reader.metadata().num_row_groups(), - 2, - "Should have 2 row groups" - ); - 
- // Step 2: Create position delete file that deletes row 0 (id=1, first row in row group 0) - let delete_file_path = format!("{table_location}/deletes.parquet"); - - let delete_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("file_path", DataType::Utf8, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - FIELD_ID_POSITIONAL_DELETE_FILE_PATH.to_string(), - )])), - Field::new("pos", DataType::Int64, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - FIELD_ID_POSITIONAL_DELETE_POS.to_string(), - )])), - ])); - - // Delete row at position 0 (0-indexed, so it's the first row: id=1) - let delete_batch = RecordBatch::try_new(delete_schema.clone(), vec![ - Arc::new(StringArray::from_iter_values(vec![data_file_path.clone()])), - Arc::new(Int64Array::from_iter_values(vec![0i64])), - ]) - .unwrap(); - - let delete_props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - - let delete_file = File::create(&delete_file_path).unwrap(); - let mut delete_writer = - ArrowWriter::try_new(delete_file, delete_schema, Some(delete_props)).unwrap(); - delete_writer.write(&delete_batch).unwrap(); - delete_writer.close().unwrap(); - - // Step 3: Get byte ranges to read ONLY row group 1 (rows 100-199) - // This exercises the row group selection code path where row group 0 is skipped - let metadata_file = File::open(&data_file_path).unwrap(); - let metadata_reader = SerializedFileReader::new(metadata_file).unwrap(); - let metadata = metadata_reader.metadata(); - - let row_group_0 = metadata.row_group(0); - let row_group_1 = metadata.row_group(1); - - let rg0_start = 4u64; // Parquet files start with 4-byte magic "PAR1" - let rg1_start = rg0_start + row_group_0.compressed_size() as u64; - let rg1_length = row_group_1.compressed_size() as u64; - - let file_io = FileIO::new_with_fs(); - let reader = ArrowReaderBuilder::new(file_io).build(); - - // Create FileScanTask that reads ONLY row group 1 via byte range filtering - let task = FileScanTask { - file_size_in_bytes: std::fs::metadata(&data_file_path).unwrap().len(), - start: rg1_start, - length: rg1_length, - record_count: Some(100), // Row group 1 has 100 rows - data_file_path: data_file_path.clone(), - data_file_format: DataFileFormat::Parquet, - schema: table_schema.clone(), - project_field_ids: vec![1], - predicate: None, - deletes: vec![FileScanTaskDeleteFile { - file_size_in_bytes: std::fs::metadata(&delete_file_path).unwrap().len(), - file_path: delete_file_path, - file_type: DataContentType::PositionDeletes, - partition_spec_id: 0, - equality_ids: None, - }], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - }; - - let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream; - let result = reader - .read(tasks) - .unwrap() - .try_collect::<Vec<RecordBatch>>() - .await - .unwrap(); - - // Step 4: Verify we got 100 rows (all of row group 1) - // The delete at position 0 is in row group 0, which is skipped, so it doesn't affect us - let total_rows: usize = result.iter().map(|b| b.num_rows()).sum(); - - assert_eq!( - total_rows, 100, - "Expected 100 rows from row group 1 (delete at position 0 is in skipped row group 0). \ - If this hangs or fails, it indicates the cached delete index was not updated after advance_to()." - ); - - // Verify we have all ids from row group 1 (101-200) - let all_ids: Vec<i32> = result - .iter() - .flat_map(|batch| { - batch - .column(0) - .as_primitive::<Int32Type>() - .values() - .iter() - .copied() - }) - .collect(); - - let expected_ids: Vec<i32> = (101..=200).collect(); - assert_eq!( - all_ids, expected_ids, - "Should have ids 101-200 (all of row group 1)" - ); - }
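A companion sketch to the test above, again with illustrative names rather than the crate's internals: after `advance_to()` skips past deletes in an unselected row group, the cached "next deleted index" must be refreshed from the cursor, otherwise the selection loop keeps comparing against a stale value and never makes progress:

```rust
fn main() {
    let deletes = vec![0u64]; // the only delete lives in skipped row group 0
    let mut pos = 0usize;
    let stale_cache = deletes.get(pos).copied(); // Some(0), cached before the skip

    // advance_to(100): move the cursor to the first delete >= 100.
    while pos < deletes.len() && deletes[pos] < 100 {
        pos += 1;
    }
    let refreshed_cache = deletes.get(pos).copied();

    assert_eq!(stale_cache, Some(0)); // comparing rows 100.. against this spins forever
    assert_eq!(refreshed_cache, None); // refreshed: no deletes affect row group 1
}
```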
- - /// Test reading Parquet files without field ID metadata (e.g., migrated tables). - /// This exercises the position-based fallback path. - /// - /// Corresponds to Java's ParquetSchemaUtil.addFallbackIds() + pruneColumnsFallback() - /// in /parquet/src/main/java/org/apache/iceberg/parquet/ParquetSchemaUtil.java - #[tokio::test] - async fn test_read_parquet_file_without_field_ids() { - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "name", Type::Primitive(PrimitiveType::String)).into(), - NestedField::required(2, "age", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - // Parquet file from a migrated table - no field ID metadata - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("name", DataType::Utf8, false), - Field::new("age", DataType::Int32, false), - ])); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::new_with_fs(); - - let name_data = vec!["Alice", "Bob", "Charlie"]; - let age_data = vec![30, 25, 35]; - - use arrow_array::Int32Array; - let name_col = Arc::new(StringArray::from(name_data.clone())) as ArrayRef; - let age_col = Arc::new(Int32Array::from(age_data.clone())) as ArrayRef; - - let to_write = RecordBatch::try_new(arrow_schema.clone(), vec![name_col, age_col]).unwrap(); - - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - - let file = File::create(format!("{table_location}/1.parquet")).unwrap(); - let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); - - writer.write(&to_write).expect("Writing batch"); - writer.close().unwrap(); - - let reader = ArrowReaderBuilder::new(file_io).build(); - - let tasks = Box::pin(futures::stream::iter( - vec![Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/1.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 2], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - })] - .into_iter(), - )) as FileScanTaskStream; - - let result = reader - .read(tasks) - .unwrap() - .try_collect::<Vec<RecordBatch>>() - .await - .unwrap(); - - assert_eq!(result.len(), 1); - let batch = &result[0]; - assert_eq!(batch.num_rows(), 3); - assert_eq!(batch.num_columns(), 2); - - // Verify position-based mapping: field_id 1 → position 0, field_id 2 → position 1 - let name_array = batch.column(0).as_string::<i32>(); - assert_eq!(name_array.value(0), "Alice"); - assert_eq!(name_array.value(1), "Bob"); - assert_eq!(name_array.value(2), "Charlie"); - - let age_array = batch - .column(1) - .as_primitive::<Int32Type>(); - assert_eq!(age_array.value(0), 30); - assert_eq!(age_array.value(1), 25); - assert_eq!(age_array.value(2), 35); - }
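The test above relies on the fallback-ID rule cited from Java's `addFallbackIds()`. A hypothetical helper (not the crate's API) makes the rule concrete: top-level columns are numbered 1..=N by position, so `project_field_ids: vec![1, 2]` resolves to columns 0 and 1:

```rust
// Hypothetical illustration of position-based fallback IDs.
fn fallback_field_ids(num_top_level_columns: usize) -> Vec<i32> {
    (1..=num_top_level_columns as i32).collect()
}

fn main() {
    // A two-column file with no field-id metadata gets ids [1, 2].
    assert_eq!(fallback_field_ids(2), vec![1, 2]);
}
```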
- - /// Test reading Parquet files without field IDs with partial projection. - /// Only a subset of columns are requested, verifying position-based fallback - /// handles column selection correctly. - #[tokio::test] - async fn test_read_parquet_without_field_ids_partial_projection() { - use arrow_array::Int32Array; - - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "col1", Type::Primitive(PrimitiveType::String)).into(), - NestedField::required(2, "col2", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::required(3, "col3", Type::Primitive(PrimitiveType::String)).into(), - NestedField::required(4, "col4", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("col1", DataType::Utf8, false), - Field::new("col2", DataType::Int32, false), - Field::new("col3", DataType::Utf8, false), - Field::new("col4", DataType::Int32, false), - ])); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::new_with_fs(); - - let col1_data = Arc::new(StringArray::from(vec!["a", "b"])) as ArrayRef; - let col2_data = Arc::new(Int32Array::from(vec![10, 20])) as ArrayRef; - let col3_data = Arc::new(StringArray::from(vec!["c", "d"])) as ArrayRef; - let col4_data = Arc::new(Int32Array::from(vec![30, 40])) as ArrayRef; - - let to_write = RecordBatch::try_new(arrow_schema.clone(), vec![ - col1_data, col2_data, col3_data, col4_data, - ]) - .unwrap(); - - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - - let file = File::create(format!("{table_location}/1.parquet")).unwrap(); - let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); - - writer.write(&to_write).expect("Writing batch"); - writer.close().unwrap(); - - let reader = ArrowReaderBuilder::new(file_io).build(); - - let tasks = Box::pin(futures::stream::iter( - vec![Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/1.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 3], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - })] - .into_iter(), - )) as FileScanTaskStream; - - let result = reader - .read(tasks) - .unwrap() - .try_collect::<Vec<RecordBatch>>() - .await - .unwrap(); - - assert_eq!(result.len(), 1); - let batch = &result[0]; - assert_eq!(batch.num_rows(), 2); - assert_eq!(batch.num_columns(), 2); - - let col1_array = batch.column(0).as_string::<i32>(); - assert_eq!(col1_array.value(0), "a"); - assert_eq!(col1_array.value(1), "b"); - - let col3_array = batch.column(1).as_string::<i32>(); - assert_eq!(col3_array.value(0), "c"); - assert_eq!(col3_array.value(1), "d"); - }
- - /// Test reading Parquet files without field IDs with schema evolution. - /// The Iceberg schema has more fields than the Parquet file, testing that - /// missing columns are filled with NULLs. - #[tokio::test] - async fn test_read_parquet_without_field_ids_schema_evolution() { - use arrow_array::{Array, Int32Array}; - - // Schema with field 3 added after the file was written - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "name", Type::Primitive(PrimitiveType::String)).into(), - NestedField::required(2, "age", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::optional(3, "city", Type::Primitive(PrimitiveType::String)).into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("name", DataType::Utf8, false), - Field::new("age", DataType::Int32, false), - ])); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::new_with_fs(); - - let name_data = Arc::new(StringArray::from(vec!["Alice", "Bob"])) as ArrayRef; - let age_data = Arc::new(Int32Array::from(vec![30, 25])) as ArrayRef; - - let to_write = - RecordBatch::try_new(arrow_schema.clone(), vec![name_data, age_data]).unwrap(); - - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - - let file = File::create(format!("{table_location}/1.parquet")).unwrap(); - let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); - - writer.write(&to_write).expect("Writing batch"); - writer.close().unwrap(); - - let reader = ArrowReaderBuilder::new(file_io).build(); - - let tasks = Box::pin(futures::stream::iter( - vec![Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/1.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 2, 3], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - })] - .into_iter(), - )) as FileScanTaskStream; - - let result = reader - .read(tasks) - .unwrap() - .try_collect::<Vec<RecordBatch>>() - .await - .unwrap(); - - assert_eq!(result.len(), 1); - let batch = &result[0]; - assert_eq!(batch.num_rows(), 2); - assert_eq!(batch.num_columns(), 3); - - let name_array = batch.column(0).as_string::<i32>(); - assert_eq!(name_array.value(0), "Alice"); - assert_eq!(name_array.value(1), "Bob"); - - let age_array = batch - .column(1) - .as_primitive::<Int32Type>(); - assert_eq!(age_array.value(0), 30); - assert_eq!(age_array.value(1), 25); - - // Verify missing column filled with NULLs - let city_array = batch.column(2).as_string::<i32>(); - assert_eq!(city_array.null_count(), 2); - assert!(city_array.is_null(0)); - assert!(city_array.is_null(1)); - }
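The NULL-fill behavior the test above asserts can be reproduced directly with arrow's `new_null_array`, the natural way to materialize a projected column that has no backing data in the file (a sketch of the behavior, not the reader's exact code path):

```rust
use arrow_array::cast::AsArray;
use arrow_array::{Array, new_null_array};
use arrow_schema::DataType;

fn main() {
    // An optional field added after the file was written has no Parquet
    // column behind it, so it surfaces as an all-null array.
    let city = new_null_array(&DataType::Utf8, 2);
    assert_eq!(city.as_string::<i32>().null_count(), 2);
}
```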
- - /// Test reading Parquet files without field IDs that have multiple row groups. - /// This ensures the position-based fallback works correctly across row group boundaries. - #[tokio::test] - async fn test_read_parquet_without_field_ids_multiple_row_groups() { - use arrow_array::Int32Array; - - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "name", Type::Primitive(PrimitiveType::String)).into(), - NestedField::required(2, "value", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("name", DataType::Utf8, false), - Field::new("value", DataType::Int32, false), - ])); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::new_with_fs(); - - // Small row group size to create multiple row groups - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .set_write_batch_size(2) - .set_max_row_group_row_count(Some(2)) - .build(); - - let file = File::create(format!("{table_location}/1.parquet")).unwrap(); - let mut writer = ArrowWriter::try_new(file, arrow_schema.clone(), Some(props)).unwrap(); - - // Write 6 rows in 3 batches (will create 3 row groups) - for batch_num in 0..3 { - let name_data = Arc::new(StringArray::from(vec![ - format!("name_{}", batch_num * 2), - format!("name_{}", batch_num * 2 + 1), - ])) as ArrayRef; - let value_data = - Arc::new(Int32Array::from(vec![batch_num * 2, batch_num * 2 + 1])) as ArrayRef; - - let batch = - RecordBatch::try_new(arrow_schema.clone(), vec![name_data, value_data]).unwrap(); - writer.write(&batch).expect("Writing batch"); - } - writer.close().unwrap(); - - let reader = ArrowReaderBuilder::new(file_io).build(); - - let tasks = Box::pin(futures::stream::iter( - vec![Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/1.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 2], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - })] - .into_iter(), - )) as FileScanTaskStream; - - let result = reader - .read(tasks) - .unwrap() - .try_collect::<Vec<RecordBatch>>() - .await - .unwrap(); - - assert!(!result.is_empty()); - - let mut all_names = Vec::new(); - let mut all_values = Vec::new(); - - for batch in &result { - let name_array = batch.column(0).as_string::<i32>(); - let value_array = batch - .column(1) - .as_primitive::<Int32Type>(); - - for i in 0..batch.num_rows() { - all_names.push(name_array.value(i).to_string()); - all_values.push(value_array.value(i)); - } - } - - assert_eq!(all_names.len(), 6); - assert_eq!(all_values.len(), 6); - - for i in 0..6 { - assert_eq!(all_names[i], format!("name_{i}")); - assert_eq!(all_values[i], i as i32); - } - }
- - /// Test reading Parquet files without field IDs with nested types (struct). - /// Java's pruneColumnsFallback() projects entire top-level columns including nested content. - /// This test verifies that a top-level struct field is projected correctly with all its nested fields. - #[tokio::test] - async fn test_read_parquet_without_field_ids_with_struct() { - use arrow_array::{Int32Array, StructArray}; - use arrow_schema::Fields; - - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::required( - 2, - "person", - Type::Struct(crate::spec::StructType::new(vec![ - NestedField::required( - 3, - "name", - Type::Primitive(PrimitiveType::String), - ) - .into(), - NestedField::required(4, "age", Type::Primitive(PrimitiveType::Int)) - .into(), - ])), - ) - .into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new( - "person", - DataType::Struct(Fields::from(vec![ - Field::new("name", DataType::Utf8, false), - Field::new("age", DataType::Int32, false), - ])), - false, - ), - ])); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::new_with_fs(); - - let id_data = Arc::new(Int32Array::from(vec![1, 2])) as ArrayRef; - let name_data = Arc::new(StringArray::from(vec!["Alice", "Bob"])) as ArrayRef; - let age_data = Arc::new(Int32Array::from(vec![30, 25])) as ArrayRef; - let person_data = Arc::new(StructArray::from(vec![ - ( - Arc::new(Field::new("name", DataType::Utf8, false)), - name_data, - ), - ( - Arc::new(Field::new("age", DataType::Int32, false)), - age_data, - ), - ])) as ArrayRef; - - let to_write = - RecordBatch::try_new(arrow_schema.clone(), vec![id_data, person_data]).unwrap(); - - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - - let file = File::create(format!("{table_location}/1.parquet")).unwrap(); - let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); - - writer.write(&to_write).expect("Writing batch"); - writer.close().unwrap(); - - let reader = ArrowReaderBuilder::new(file_io).build(); - - let tasks = Box::pin(futures::stream::iter( - vec![Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/1.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 2], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - })] - .into_iter(), - )) as FileScanTaskStream; - - let result = reader - .read(tasks) - .unwrap() - .try_collect::<Vec<RecordBatch>>() - .await - .unwrap(); - - assert_eq!(result.len(), 1); - let batch = &result[0]; - assert_eq!(batch.num_rows(), 2); - assert_eq!(batch.num_columns(), 2); - - let id_array = batch - .column(0) - .as_primitive::<Int32Type>(); - assert_eq!(id_array.value(0), 1); - assert_eq!(id_array.value(1), 2); - - let person_array = batch.column(1).as_struct(); - assert_eq!(person_array.num_columns(), 2); - - let name_array = person_array.column(0).as_string::<i32>(); - assert_eq!(name_array.value(0), "Alice"); - assert_eq!(name_array.value(1), "Bob"); - - let age_array = person_array - .column(1) - .as_primitive::<Int32Type>(); - assert_eq!(age_array.value(0), 30); - assert_eq!(age_array.value(1), 25); - }
- - /// Test reading Parquet files without field IDs with schema evolution - column added in the middle. - /// When a new column is inserted between existing columns in the schema order, - /// the fallback projection must correctly map field IDs to output positions. - #[tokio::test] - async fn test_read_parquet_without_field_ids_schema_evolution_add_column_in_middle() { - use arrow_array::{Array, Int32Array}; - - let arrow_schema_old = Arc::new(ArrowSchema::new(vec![ - Field::new("col0", DataType::Int32, true), - Field::new("col1", DataType::Int32, true), - ])); - - // New column added between existing columns: col0 (id=1), newCol (id=5), col1 (id=2) - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::optional(1, "col0", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::optional(5, "newCol", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::optional(2, "col1", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::new_with_fs(); - - let col0_data = Arc::new(Int32Array::from(vec![1, 2])) as ArrayRef; - let col1_data = Arc::new(Int32Array::from(vec![10, 20])) as ArrayRef; - - let to_write = - RecordBatch::try_new(arrow_schema_old.clone(), vec![col0_data, col1_data]).unwrap(); - - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - - let file = File::create(format!("{table_location}/1.parquet")).unwrap(); - let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); - writer.write(&to_write).expect("Writing batch"); - writer.close().unwrap(); - - let reader = ArrowReaderBuilder::new(file_io).build(); - - let tasks = Box::pin(futures::stream::iter( - vec![Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/1.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 5, 2], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - })] - .into_iter(), - )) as FileScanTaskStream; - - let result = reader - .read(tasks) - .unwrap() - .try_collect::<Vec<RecordBatch>>() - .await - .unwrap(); - - assert_eq!(result.len(), 1); - let batch = &result[0]; - assert_eq!(batch.num_rows(), 2); - assert_eq!(batch.num_columns(), 3); - - let result_col0 = batch - .column(0) - .as_primitive::<Int32Type>(); - assert_eq!(result_col0.value(0), 1); - assert_eq!(result_col0.value(1), 2); - - // New column should be NULL (doesn't exist in old file) - let result_newcol = batch - .column(1) - .as_primitive::<Int32Type>(); - assert_eq!(result_newcol.null_count(), 2); - assert!(result_newcol.is_null(0)); - assert!(result_newcol.is_null(1)); - - let result_col1 = batch - .column(2) - .as_primitive::<Int32Type>(); - assert_eq!(result_col1.value(0), 10); - assert_eq!(result_col1.value(1), 20); - }
- - /// Test reading Parquet files without field IDs with a filter that eliminates all row groups. - /// During development of field ID mapping, we saw a panic when row_selection_enabled=true and - /// all row groups are filtered out. - #[tokio::test] - async fn test_read_parquet_without_field_ids_filter_eliminates_all_rows() { - use arrow_array::{Float64Array, Int32Array}; - - // Schema with fields that will use fallback IDs 1, 2, 3 - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::required(2, "name", Type::Primitive(PrimitiveType::String)).into(), - NestedField::required(3, "value", Type::Primitive(PrimitiveType::Double)) - .into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("name", DataType::Utf8, false), - Field::new("value", DataType::Float64, false), - ])); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::new_with_fs(); - - // Write data where all ids are >= 10 - let id_data = Arc::new(Int32Array::from(vec![10, 11, 12])) as ArrayRef; - let name_data = Arc::new(StringArray::from(vec!["a", "b", "c"])) as ArrayRef; - let value_data = Arc::new(Float64Array::from(vec![100.0, 200.0, 300.0])) as ArrayRef; - - let to_write = - RecordBatch::try_new(arrow_schema.clone(), vec![id_data, name_data, value_data]) - .unwrap(); - - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - - let file = File::create(format!("{table_location}/1.parquet")).unwrap(); - let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); - writer.write(&to_write).expect("Writing batch"); - writer.close().unwrap(); - - // Filter that eliminates all row groups: id < 5 - let predicate = Reference::new("id").less_than(Datum::int(5)); - - // Enable both row_group_filtering and row_selection - triggered the panic - let reader = ArrowReaderBuilder::new(file_io) - .with_row_group_filtering_enabled(true) - .with_row_selection_enabled(true) - .build(); - - let tasks = Box::pin(futures::stream::iter( - vec![Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/1.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 2, 3], - predicate: Some(predicate.bind(schema, true).unwrap()), - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - })] - .into_iter(), - )) as FileScanTaskStream; - - // Should no longer panic - let result = reader - .read(tasks) - .unwrap() - .try_collect::<Vec<RecordBatch>>() - .await - .unwrap(); - - // Should return empty results - assert!(result.is_empty() || result.iter().all(|batch| batch.num_rows() == 0)); - }
- - /// Test that concurrency=1 reads all files correctly and in deterministic order. - /// This verifies the fast-path optimization for single concurrency. - #[tokio::test] - async fn test_read_with_concurrency_one() { - use arrow_array::Int32Array; - - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::required(2, "file_num", Type::Primitive(PrimitiveType::Int)) - .into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "1".to_string(), - )])), - Field::new("file_num", DataType::Int32, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "2".to_string(), - )])), - ])); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::new_with_fs(); - - // Create 3 parquet files with different data - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - - for file_num in 0..3 { - let id_data = Arc::new(Int32Array::from_iter_values( - file_num * 10..(file_num + 1) * 10, - )) as ArrayRef; - let file_num_data = Arc::new(Int32Array::from(vec![file_num; 10])) as ArrayRef; - - let to_write = - RecordBatch::try_new(arrow_schema.clone(), vec![id_data, file_num_data]).unwrap(); - - let file = File::create(format!("{table_location}/file_{file_num}.parquet")).unwrap(); - let mut writer = - ArrowWriter::try_new(file, to_write.schema(), Some(props.clone())).unwrap(); - writer.write(&to_write).expect("Writing batch"); - writer.close().unwrap(); - } - - // Read with concurrency=1 (fast-path) - let reader = ArrowReaderBuilder::new(file_io) - .with_data_file_concurrency_limit(1) - .build(); - - // Create tasks in a specific order: file_0, file_1, file_2 - let tasks = vec![ - Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/file_0.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/file_0.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 2], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - }), - Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/file_1.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/file_1.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 2], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - }), - Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/file_2.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/file_2.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 2], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - }), - ]; - - let tasks_stream = Box::pin(futures::stream::iter(tasks)) as FileScanTaskStream; - - let result = reader - .read(tasks_stream) - .unwrap() - .try_collect::<Vec<RecordBatch>>() - .await - .unwrap(); - - // Verify we got all 30 rows (10 from each file) - let total_rows: usize = result.iter().map(|b| b.num_rows()).sum(); - assert_eq!(total_rows, 30, "Should have 30 total rows"); - - // Collect all ids and file_nums to verify data - let mut all_ids = Vec::new(); - let mut all_file_nums = Vec::new(); - - for batch in &result { - let id_col = batch - .column(0) - .as_primitive::<Int32Type>(); - let file_num_col = batch - .column(1) - .as_primitive::<Int32Type>(); - - for i in 0..batch.num_rows() { - all_ids.push(id_col.value(i)); - all_file_nums.push(file_num_col.value(i)); - } - } - - assert_eq!(all_ids.len(), 30); - assert_eq!(all_file_nums.len(), 30); - - // With concurrency=1 and sequential processing, files should be processed in order - // file_0: ids 0-9, file_num=0 - // file_1: ids 10-19, file_num=1 - // file_2: ids 20-29, file_num=2 - for i in 0..10 { - assert_eq!(all_file_nums[i], 0, "First 10 rows should be from file_0"); - assert_eq!(all_ids[i], i as i32, "IDs should be 0-9"); - } - for i in 10..20 { - assert_eq!(all_file_nums[i], 1, "Next 10 rows should be from file_1"); - assert_eq!(all_ids[i], i as i32, "IDs should be 10-19"); - } - for i in 20..30 { - assert_eq!(all_file_nums[i], 2, "Last 10 rows should be from file_2"); - assert_eq!(all_ids[i], i as i32, "IDs should be 20-29"); - } - }
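The determinism asserted above follows from how a stream is driven when at most one task is in flight. A minimal sketch (assuming the `futures` and `tokio` crates) of the general pattern: `buffered(1)` polls a single future at a time, so files are read strictly sequentially and results arrive in task order:

```rust
use futures::{StreamExt, stream};

#[tokio::main]
async fn main() {
    let out: Vec<u32> = stream::iter(0u32..3)
        .map(|n| async move { n * 10 }) // stand-in for "read one file"
        .buffered(1) // at most one in-flight read
        .collect()
        .await;
    assert_eq!(out, vec![0, 10, 20]); // task order preserved
}
```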
- - /// Test bucket partitioning reads source column from data file (not partition metadata). - /// - /// This is an integration test verifying the complete ArrowReader pipeline with bucket partitioning. - /// It corresponds to TestRuntimeFiltering tests in Iceberg Java (e.g., testRenamedSourceColumnTable). - /// - /// # Iceberg Spec Requirements - /// - /// Per the Iceberg spec "Column Projection" section: - /// > "Return the value from partition metadata if an **Identity Transform** exists for the field" - /// - /// This means: - /// - Identity transforms (e.g., `identity(dept)`) use constants from partition metadata - /// - Non-identity transforms (e.g., `bucket(4, id)`) must read source columns from data files - /// - Partition metadata for bucket transforms stores bucket numbers (0-3), NOT source values - /// - /// Java's PartitionUtil.constantsMap() implements this via: - /// ```java - /// if (field.transform().isIdentity()) { - /// idToConstant.put(field.sourceId(), converted); - /// } - /// ``` - /// - /// # What This Test Verifies - /// - /// This test ensures the full ArrowReader → RecordBatchTransformer pipeline correctly handles - /// bucket partitioning when FileScanTask provides partition_spec and partition_data: - /// - /// - Parquet file has field_id=1 named "id" with actual data [1, 5, 9, 13] - /// - FileScanTask specifies partition_spec with bucket(4, id) and partition_data with bucket=1 - /// - RecordBatchTransformer.constants_map() excludes bucket-partitioned field from constants - /// - ArrowReader correctly reads [1, 5, 9, 13] from the data file - /// - Values are NOT replaced with constant 1 from partition metadata - /// - /// # Why This Matters - /// - /// Without correct handling: - /// - Runtime filtering would break (e.g., `WHERE id = 5` would fail) - /// - Query results would be incorrect (all rows would have id=1) - /// - Bucket partitioning would be unusable for query optimization - /// - /// # References - /// - Iceberg spec: format/spec.md "Column Projection" + "Partition Transforms" - /// - Java test: spark/src/test/java/.../TestRuntimeFiltering.java - /// - Java impl: core/src/main/java/org/apache/iceberg/util/PartitionUtil.java
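Before the test, a small sketch of the identity-only rule quoted above from `PartitionUtil.constantsMap()`. The types here are illustrative stand-ins, not the crate's `Transform`/`PartitionSpec`:

```rust
#[derive(PartialEq)]
enum Transform {
    Identity,
    Bucket(u32),
}

struct PartitionField {
    source_id: i32,
    transform: Transform,
}

// Only identity-transformed fields contribute (source_id, value) constants.
fn constants_map(fields: &[PartitionField], values: &[i32]) -> Vec<(i32, i32)> {
    fields
        .iter()
        .zip(values)
        .filter(|(f, _)| f.transform == Transform::Identity)
        .map(|(f, v)| (f.source_id, *v))
        .collect()
}

fn main() {
    // bucket(4, id): partition metadata stores the bucket number (1), which
    // must NOT be projected as the value of the source column `id`.
    let fields = [PartitionField { source_id: 1, transform: Transform::Bucket(4) }];
    assert!(constants_map(&fields, &[1]).is_empty());
}
```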
- #[tokio::test] - async fn test_bucket_partitioning_reads_source_column_from_file() { - use arrow_array::Int32Array; - - use crate::spec::{Literal, PartitionSpec, Struct, Transform}; - - // Iceberg schema with id and name columns - let schema = Arc::new( - Schema::builder() - .with_schema_id(0) - .with_fields(vec![ - NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::optional(2, "name", Type::Primitive(PrimitiveType::String)).into(), - ]) - .build() - .unwrap(), - ); - - // Partition spec: bucket(4, id) - let partition_spec = Arc::new( - PartitionSpec::builder(schema.clone()) - .with_spec_id(0) - .add_partition_field("id", "id_bucket", Transform::Bucket(4)) - .unwrap() - .build() - .unwrap(), - ); - - // Partition data: bucket value is 1 - let partition_data = Struct::from_iter(vec![Some(Literal::int(1))]); - - // Create Arrow schema with field IDs for Parquet file - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "1".to_string(), - )])), - Field::new("name", DataType::Utf8, true).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "2".to_string(), - )])), - ])); - - // Write Parquet file with data - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::new_with_fs(); - - let id_data = Arc::new(Int32Array::from(vec![1, 5, 9, 13])) as ArrayRef; - let name_data = - Arc::new(StringArray::from(vec!["Alice", "Bob", "Charlie", "Dave"])) as ArrayRef; - - let to_write = - RecordBatch::try_new(arrow_schema.clone(), vec![id_data, name_data]).unwrap(); - - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - let file = File::create(format!("{}/data.parquet", &table_location)).unwrap(); - let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); - writer.write(&to_write).expect("Writing batch"); - writer.close().unwrap(); - - // Read the Parquet file with partition spec and data - let reader = ArrowReaderBuilder::new(file_io).build(); - let tasks = Box::pin(futures::stream::iter( - vec![Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/data.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/data.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 2], - predicate: None, - deletes: vec![], - partition: Some(partition_data), - partition_spec: Some(partition_spec), - name_mapping: None, - case_sensitive: false, - })] - .into_iter(), - )) as FileScanTaskStream; - - let result = reader - .read(tasks) - .unwrap() - .try_collect::<Vec<RecordBatch>>() - .await - .unwrap(); - - // Verify we got the correct data - assert_eq!(result.len(), 1); - let batch = &result[0]; - - assert_eq!(batch.num_columns(), 2); - assert_eq!(batch.num_rows(), 4); - - // The id column MUST contain actual values from the Parquet file [1, 5, 9, 13], - // NOT the constant partition value 1 - let id_col = batch - .column(0) - .as_primitive::<Int32Type>(); - assert_eq!(id_col.value(0), 1); - assert_eq!(id_col.value(1), 5); - assert_eq!(id_col.value(2), 9); - assert_eq!(id_col.value(3), 13); - - let name_col = batch.column(1).as_string::<i32>(); - assert_eq!(name_col.value(0), "Alice"); - assert_eq!(name_col.value(1), "Bob"); - assert_eq!(name_col.value(2), "Charlie"); - assert_eq!(name_col.value(3), "Dave"); - } - - #[test] - fn test_merge_ranges_empty() { - assert_eq!(super::merge_ranges(&[], 1024), Vec::<Range<u64>>::new()); - }
- - #[test] - fn test_merge_ranges_no_coalesce() { - // Ranges far apart should not be merged - let ranges = vec![0..100, 1_000_000..1_000_100]; - let merged = super::merge_ranges(&ranges, 1024); - assert_eq!(merged, vec![0..100, 1_000_000..1_000_100]); - } - - #[test] - fn test_merge_ranges_coalesce() { - // Ranges within the gap threshold should be merged - let ranges = vec![0..100, 200..300, 500..600]; - let merged = super::merge_ranges(&ranges, 1024); - assert_eq!(merged, vec![0..600]); - } - - #[test] - fn test_merge_ranges_overlapping() { - let ranges = vec![0..200, 100..300]; - let merged = super::merge_ranges(&ranges, 0); - assert_eq!(merged, vec![0..300]); - } - - #[test] - fn test_merge_ranges_unsorted() { - let ranges = vec![500..600, 0..100, 200..300]; - let merged = super::merge_ranges(&ranges, 1024); - assert_eq!(merged, vec![0..600]); - } - - /// Mock FileRead backed by a flat byte buffer. - struct MockFileRead { - data: bytes::Bytes, - } - - impl MockFileRead { - fn new(size: usize) -> Self { - // Fill with sequential byte values so slices are verifiable. - let data: Vec<u8> = (0..size).map(|i| (i % 256) as u8).collect(); - Self { - data: bytes::Bytes::from(data), - } - } - } - - #[async_trait::async_trait] - impl crate::io::FileRead for MockFileRead { - async fn read(&self, range: Range<u64>) -> crate::Result<bytes::Bytes> { - Ok(self.data.slice(range.start as usize..range.end as usize)) - } - } - - #[tokio::test] - async fn test_get_byte_ranges_no_coalesce() { - use parquet::arrow::async_reader::AsyncFileReader; - - let mock = MockFileRead::new(2048); - let expected_0 = mock.data.slice(0..100); - let expected_1 = mock.data.slice(1500..1600); - - let mut reader = - super::ArrowFileReader::new(crate::io::FileMetadata { size: 2048 }, Box::new(mock)) - .with_parquet_read_options( - super::ParquetReadOptions::builder() - .with_range_coalesce_bytes(0) - .build(), - ); - - let result = reader - .get_byte_ranges(vec![0..100, 1500..1600]) - .await - .unwrap(); - - assert_eq!(result.len(), 2); - assert_eq!(result[0], expected_0); - assert_eq!(result[1], expected_1); - } - - #[tokio::test] - async fn test_get_byte_ranges_with_coalesce() { - use parquet::arrow::async_reader::AsyncFileReader; - - let mock = MockFileRead::new(1024); - let expected_0 = mock.data.slice(0..100); - let expected_1 = mock.data.slice(200..300); - let expected_2 = mock.data.slice(500..600); - - let mut reader = - super::ArrowFileReader::new(crate::io::FileMetadata { size: 1024 }, Box::new(mock)) - .with_parquet_read_options( - super::ParquetReadOptions::builder() - .with_range_coalesce_bytes(1024) - .build(), - ); - - // All ranges within coalesce threshold — should merge into one fetch.
- let result = reader - .get_byte_ranges(vec![0..100, 200..300, 500..600]) - .await - .unwrap(); - - assert_eq!(result.len(), 3); - assert_eq!(result[0], expected_0); - assert_eq!(result[1], expected_1); - assert_eq!(result[2], expected_2); - } - - #[tokio::test] - async fn test_get_byte_ranges_empty() { - use parquet::arrow::async_reader::AsyncFileReader; - - let mock = MockFileRead::new(1024); - let mut reader = - super::ArrowFileReader::new(crate::io::FileMetadata { size: 1024 }, Box::new(mock)); - - let result = reader.get_byte_ranges(vec![]).await.unwrap(); - assert!(result.is_empty()); - } - - #[tokio::test] - async fn test_get_byte_ranges_coalesce_max() { - use parquet::arrow::async_reader::AsyncFileReader; - - let mock = MockFileRead::new(2048); - let expected_0 = mock.data.slice(0..100); - let expected_1 = mock.data.slice(1500..1600); - - let mut reader = - super::ArrowFileReader::new(crate::io::FileMetadata { size: 2048 }, Box::new(mock)) - .with_parquet_read_options( - super::ParquetReadOptions::builder() - .with_range_coalesce_bytes(u64::MAX) - .build(), - ); - - // u64::MAX coalesce — all ranges merge into a single fetch. - let result = reader - .get_byte_ranges(vec![0..100, 1500..1600]) - .await - .unwrap(); - - assert_eq!(result.len(), 2); - assert_eq!(result[0], expected_0); - assert_eq!(result[1], expected_1); - } - - #[tokio::test] - async fn test_get_byte_ranges_concurrency_zero() { - use parquet::arrow::async_reader::AsyncFileReader; - - // concurrency=0 is clamped to 1, so this should not hang. - let mock = MockFileRead::new(1024); - let expected = mock.data.slice(0..100); - - let mut reader = - super::ArrowFileReader::new(crate::io::FileMetadata { size: 1024 }, Box::new(mock)) - .with_parquet_read_options( - super::ParquetReadOptions::builder() - .with_range_fetch_concurrency(0) - .build(), - ); - - let result = reader - .get_byte_ranges(vec![0..100, 200..300]) - .await - .unwrap(); - assert_eq!(result.len(), 2); - assert_eq!(result[0], expected); - } - - #[tokio::test] - async fn test_get_byte_ranges_concurrency_one() { - use parquet::arrow::async_reader::AsyncFileReader; - - let mock = MockFileRead::new(2048); - let expected_0 = mock.data.slice(0..100); - let expected_1 = mock.data.slice(500..600); - let expected_2 = mock.data.slice(1500..1600); - - let mut reader = - super::ArrowFileReader::new(crate::io::FileMetadata { size: 2048 }, Box::new(mock)) - .with_parquet_read_options( - super::ParquetReadOptions::builder() - .with_range_coalesce_bytes(0) - .with_range_fetch_concurrency(1) - .build(), - ); - - // concurrency=1 with no coalescing — sequential fetches. - let result = reader - .get_byte_ranges(vec![0..100, 500..600, 1500..1600]) - .await - .unwrap(); - - assert_eq!(result.len(), 3); - assert_eq!(result[0], expected_0); - assert_eq!(result[1], expected_1); - assert_eq!(result[2], expected_2); - } -} diff --git a/crates/iceberg/src/arrow/reader/file_reader.rs b/crates/iceberg/src/arrow/reader/file_reader.rs new file mode 100644 index 0000000000..79fbcc7960 --- /dev/null +++ b/crates/iceberg/src/arrow/reader/file_reader.rs @@ -0,0 +1,368 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Async Parquet file reader that adapts an Iceberg `FileRead` to parquet's `AsyncFileReader`. + +use std::ops::Range; +use std::sync::Arc; + +use bytes::Bytes; +use futures::future::BoxFuture; +use futures::{FutureExt, StreamExt, TryFutureExt, TryStreamExt}; +use parquet::arrow::arrow_reader::ArrowReaderOptions; +use parquet::arrow::async_reader::AsyncFileReader; +use parquet::file::metadata::{PageIndexPolicy, ParquetMetaData, ParquetMetaDataReader}; + +use super::ParquetReadOptions; +use crate::io::{FileMetadata, FileRead}; + +/// ArrowFileReader is a wrapper around a FileRead that implements parquet's AsyncFileReader. +pub struct ArrowFileReader { + meta: FileMetadata, + parquet_read_options: ParquetReadOptions, + r: Box<dyn FileRead>, +} + +impl ArrowFileReader { + /// Create a new ArrowFileReader + pub fn new(meta: FileMetadata, r: Box<dyn FileRead>) -> Self { + Self { + meta, + parquet_read_options: ParquetReadOptions::builder().build(), + r, + } + } + + /// Configure all Parquet read options. + pub(crate) fn with_parquet_read_options(mut self, options: ParquetReadOptions) -> Self { + self.parquet_read_options = options; + self + } +} + +impl AsyncFileReader for ArrowFileReader { + fn get_bytes(&mut self, range: Range<u64>) -> BoxFuture<'_, parquet::errors::Result<Bytes>> { + Box::pin( + self.r + .read(range.start..range.end) + .map_err(|err| parquet::errors::ParquetError::External(Box::new(err))), + ) + } + + /// Override the default `get_byte_ranges` which calls `get_bytes` sequentially. + /// The parquet reader calls this to fetch column chunks for a row group, so + /// without this override each column chunk is a serial round-trip to object storage. + /// Adapted from object_store's `coalesce_ranges` in `util.rs`. + fn get_byte_ranges( + &mut self, + ranges: Vec<Range<u64>>, + ) -> BoxFuture<'_, parquet::errors::Result<Vec<Bytes>>> { + let coalesce_bytes = self.parquet_read_options.range_coalesce_bytes(); + let concurrency = self.parquet_read_options.range_fetch_concurrency().max(1); + + async move { + // Merge nearby ranges to reduce the number of object store requests. + let fetch_ranges = merge_ranges(&ranges, coalesce_bytes); + let r = &self.r; + + // Fetch merged ranges concurrently. + let fetched: Vec<Bytes> = futures::stream::iter(fetch_ranges.iter().cloned()) + .map(|range| async move { + r.read(range) + .await + .map_err(|e| parquet::errors::ParquetError::External(Box::new(e))) + }) + .buffered(concurrency) + .try_collect() + .await?; + + // Slice the fetched data back into the originally requested ranges.
+ Ok(ranges + .iter() + .map(|range| { + let idx = fetch_ranges.partition_point(|v| v.start <= range.start) - 1; + let fetch_range = &fetch_ranges[idx]; + let fetch_bytes = &fetched[idx]; + let start = (range.start - fetch_range.start) as usize; + let end = (range.end - fetch_range.start) as usize; + fetch_bytes.slice(start..end.min(fetch_bytes.len())) + }) + .collect()) + } + .boxed() + } + + // TODO: we don't currently respect `ArrowReaderOptions` because it doesn't expose any method to access its option fields; + // we will fix this once `v55.1.0` is released, see https://github.com/apache/arrow-rs/issues/7393 + fn get_metadata( + &mut self, + _options: Option<&'_ ArrowReaderOptions>, + ) -> BoxFuture<'_, parquet::errors::Result<Arc<ParquetMetaData>>> { + async move { + let reader = ParquetMetaDataReader::new() + .with_prefetch_hint(self.parquet_read_options.metadata_size_hint()) + // Set the page policy first because it updates both column and offset policies. + .with_page_index_policy(PageIndexPolicy::from( + self.parquet_read_options.preload_page_index(), + )) + .with_column_index_policy(PageIndexPolicy::from( + self.parquet_read_options.preload_column_index(), + )) + .with_offset_index_policy(PageIndexPolicy::from( + self.parquet_read_options.preload_offset_index(), + )); + let size = self.meta.size; + let meta = reader.load_and_finish(self, size).await?; + + Ok(Arc::new(meta)) + } + .boxed() + } +} + +/// Merge overlapping or nearby byte ranges, combining ranges with gaps <= `coalesce` bytes. +/// Adapted from object_store's `merge_ranges` in `util.rs`. +fn merge_ranges(ranges: &[Range<u64>], coalesce: u64) -> Vec<Range<u64>> { + if ranges.is_empty() { + return vec![]; + } + + let mut ranges = ranges.to_vec(); + ranges.sort_unstable_by_key(|r| r.start); + + let mut merged = Vec::with_capacity(ranges.len()); + let mut start_idx = 0; + let mut end_idx = 1; + + while start_idx != ranges.len() { + let mut range_end = ranges[start_idx].end; + + while end_idx != ranges.len() + && ranges[end_idx] + .start + .checked_sub(range_end) + .map(|delta| delta <= coalesce) + .unwrap_or(true) + { + range_end = range_end.max(ranges[end_idx].end); + end_idx += 1; + } + + merged.push(ranges[start_idx].start..range_end); + start_idx = end_idx; + end_idx += 1; + } + + merged +} + +#[cfg(test)] +mod tests { + use std::ops::Range; + + use parquet::arrow::async_reader::AsyncFileReader; + + use super::{ArrowFileReader, ParquetReadOptions, merge_ranges}; + use crate::io::{FileMetadata, FileRead}; + + #[test] + fn test_merge_ranges_empty() { + assert_eq!(merge_ranges(&[], 1024), Vec::<Range<u64>>::new()); + } + + #[test] + fn test_merge_ranges_no_coalesce() { + // Ranges far apart should not be merged + let ranges = vec![0..100, 1_000_000..1_000_100]; + let merged = merge_ranges(&ranges, 1024); + assert_eq!(merged, vec![0..100, 1_000_000..1_000_100]); + } + + #[test] + fn test_merge_ranges_coalesce() { + // Ranges within the gap threshold should be merged + let ranges = vec![0..100, 200..300, 500..600]; + let merged = merge_ranges(&ranges, 1024); + assert_eq!(merged, vec![0..600]); + } + + #[test] + fn test_merge_ranges_overlapping() { + let ranges = vec![0..200, 100..300]; + let merged = merge_ranges(&ranges, 0); + assert_eq!(merged, vec![0..300]); + } + + #[test] + fn test_merge_ranges_unsorted() { + let ranges = vec![500..600, 0..100, 200..300]; + let merged = merge_ranges(&ranges, 1024); + assert_eq!(merged, vec![0..600]); + }
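A worked example of the slice-back step in `get_byte_ranges` above: after the merged ranges are fetched, `partition_point` locates the merged range that contains each original request so the requested bytes can be sliced back out of the fetched buffer:

```rust
fn main() {
    // Merged from the requests [0..100, 200..300, 500..600] with a 1 KiB gap.
    let fetch_ranges = vec![0u64..600];
    let request = 200u64..300;

    // Index of the last merged range starting at or before the request.
    let idx = fetch_ranges.partition_point(|v| v.start <= request.start) - 1;
    let fetch_range = &fetch_ranges[idx];

    // Offsets of the request within the fetched buffer.
    let start = (request.start - fetch_range.start) as usize;
    let end = (request.end - fetch_range.start) as usize;
    assert_eq!((idx, start, end), (0, 200, 300));
}
```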
+    struct MockFileRead {
+        data: bytes::Bytes,
+    }
+
+    impl MockFileRead {
+        fn new(size: usize) -> Self {
+            // Fill with sequential byte values so slices are verifiable.
+            let data: Vec<u8> = (0..size).map(|i| (i % 256) as u8).collect();
+            Self {
+                data: bytes::Bytes::from(data),
+            }
+        }
+    }
+
+    #[async_trait::async_trait]
+    impl FileRead for MockFileRead {
+        async fn read(&self, range: Range<u64>) -> crate::Result<bytes::Bytes> {
+            Ok(self.data.slice(range.start as usize..range.end as usize))
+        }
+    }
+
+    #[tokio::test]
+    async fn test_get_byte_ranges_no_coalesce() {
+        let mock = MockFileRead::new(2048);
+        let expected_0 = mock.data.slice(0..100);
+        let expected_1 = mock.data.slice(1500..1600);
+
+        let mut reader = ArrowFileReader::new(FileMetadata { size: 2048 }, Box::new(mock))
+            .with_parquet_read_options(
+                ParquetReadOptions::builder()
+                    .with_range_coalesce_bytes(0)
+                    .build(),
+            );
+
+        let result = reader
+            .get_byte_ranges(vec![0..100, 1500..1600])
+            .await
+            .unwrap();
+
+        assert_eq!(result.len(), 2);
+        assert_eq!(result[0], expected_0);
+        assert_eq!(result[1], expected_1);
+    }
+
+    #[tokio::test]
+    async fn test_get_byte_ranges_with_coalesce() {
+        let mock = MockFileRead::new(1024);
+        let expected_0 = mock.data.slice(0..100);
+        let expected_1 = mock.data.slice(200..300);
+        let expected_2 = mock.data.slice(500..600);
+
+        let mut reader = ArrowFileReader::new(FileMetadata { size: 1024 }, Box::new(mock))
+            .with_parquet_read_options(
+                ParquetReadOptions::builder()
+                    .with_range_coalesce_bytes(1024)
+                    .build(),
+            );
+
+        // All ranges within coalesce threshold — should merge into one fetch.
+        let result = reader
+            .get_byte_ranges(vec![0..100, 200..300, 500..600])
+            .await
+            .unwrap();
+
+        assert_eq!(result.len(), 3);
+        assert_eq!(result[0], expected_0);
+        assert_eq!(result[1], expected_1);
+        assert_eq!(result[2], expected_2);
+    }
+
+    #[tokio::test]
+    async fn test_get_byte_ranges_empty() {
+        let mock = MockFileRead::new(1024);
+        let mut reader = ArrowFileReader::new(FileMetadata { size: 1024 }, Box::new(mock));
+
+        let result = reader.get_byte_ranges(vec![]).await.unwrap();
+        assert!(result.is_empty());
+    }
+
+    #[tokio::test]
+    async fn test_get_byte_ranges_coalesce_max() {
+        let mock = MockFileRead::new(2048);
+        let expected_0 = mock.data.slice(0..100);
+        let expected_1 = mock.data.slice(1500..1600);
+
+        let mut reader = ArrowFileReader::new(FileMetadata { size: 2048 }, Box::new(mock))
+            .with_parquet_read_options(
+                ParquetReadOptions::builder()
+                    .with_range_coalesce_bytes(u64::MAX)
+                    .build(),
+            );
+
+        // u64::MAX coalesce — all ranges merge into a single fetch.
+        let result = reader
+            .get_byte_ranges(vec![0..100, 1500..1600])
+            .await
+            .unwrap();
+
+        assert_eq!(result.len(), 2);
+        assert_eq!(result[0], expected_0);
+        assert_eq!(result[1], expected_1);
+    }
+
+    #[tokio::test]
+    async fn test_get_byte_ranges_concurrency_zero() {
+        // concurrency=0 is clamped to 1, so this should not hang.
+ let mock = MockFileRead::new(1024); + let expected = mock.data.slice(0..100); + + let mut reader = ArrowFileReader::new(FileMetadata { size: 1024 }, Box::new(mock)) + .with_parquet_read_options( + ParquetReadOptions::builder() + .with_range_fetch_concurrency(0) + .build(), + ); + + let result = reader + .get_byte_ranges(vec![0..100, 200..300]) + .await + .unwrap(); + assert_eq!(result.len(), 2); + assert_eq!(result[0], expected); + } + + #[tokio::test] + async fn test_get_byte_ranges_concurrency_one() { + let mock = MockFileRead::new(2048); + let expected_0 = mock.data.slice(0..100); + let expected_1 = mock.data.slice(500..600); + let expected_2 = mock.data.slice(1500..1600); + + let mut reader = ArrowFileReader::new(FileMetadata { size: 2048 }, Box::new(mock)) + .with_parquet_read_options( + ParquetReadOptions::builder() + .with_range_coalesce_bytes(0) + .with_range_fetch_concurrency(1) + .build(), + ); + + // concurrency=1 with no coalescing — sequential fetches. + let result = reader + .get_byte_ranges(vec![0..100, 500..600, 1500..1600]) + .await + .unwrap(); + + assert_eq!(result.len(), 3); + assert_eq!(result[0], expected_0); + assert_eq!(result[1], expected_1); + assert_eq!(result[2], expected_2); + } +} diff --git a/crates/iceberg/src/arrow/reader/mod.rs b/crates/iceberg/src/arrow/reader/mod.rs new file mode 100644 index 0000000000..c6c41accb7 --- /dev/null +++ b/crates/iceberg/src/arrow/reader/mod.rs @@ -0,0 +1,154 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Parquet file data reader + +use crate::arrow::caching_delete_file_loader::CachingDeleteFileLoader; +use crate::io::FileIO; +use crate::util::available_parallelism; + +/// Default gap between byte ranges below which they are coalesced into a +/// single request. Matches object_store's `OBJECT_STORE_COALESCE_DEFAULT`. +const DEFAULT_RANGE_COALESCE_BYTES: u64 = 1024 * 1024; + +/// Default maximum number of coalesced byte ranges fetched concurrently. +/// Matches object_store's `OBJECT_STORE_COALESCE_PARALLEL`. +const DEFAULT_RANGE_FETCH_CONCURRENCY: usize = 10; + +/// Default number of bytes to prefetch when parsing Parquet footer metadata. +/// Matches DataFusion's default `ParquetOptions::metadata_size_hint`. 
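+/// If the footer and metadata turn out to be larger than the hint, the reader
+/// falls back to additional fetches, so this is a trade-off rather than a cap.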
+const DEFAULT_METADATA_SIZE_HINT: usize = 512 * 1024;
+
+mod file_reader;
+mod options;
+mod pipeline;
+mod positional_deletes;
+mod predicate_visitor;
+mod projection;
+mod row_filter;
+pub use file_reader::ArrowFileReader;
+pub(crate) use options::ParquetReadOptions;
+use predicate_visitor::{CollectFieldIdVisitor, PredicateConverter};
+use projection::{add_fallback_field_ids_to_arrow_schema, apply_name_mapping_to_arrow_schema};
+
+/// Builder to create ArrowReader
+pub struct ArrowReaderBuilder {
+    batch_size: Option<usize>,
+    file_io: FileIO,
+    concurrency_limit_data_files: usize,
+    row_group_filtering_enabled: bool,
+    row_selection_enabled: bool,
+    parquet_read_options: ParquetReadOptions,
+}
+
+impl ArrowReaderBuilder {
+    /// Create a new ArrowReaderBuilder
+    pub fn new(file_io: FileIO) -> Self {
+        let num_cpus = available_parallelism().get();
+
+        ArrowReaderBuilder {
+            batch_size: None,
+            file_io,
+            concurrency_limit_data_files: num_cpus,
+            row_group_filtering_enabled: true,
+            row_selection_enabled: false,
+            parquet_read_options: ParquetReadOptions::builder().build(),
+        }
+    }
+
+    /// Sets the max number of in-flight data files that are being fetched.
+    pub fn with_data_file_concurrency_limit(mut self, val: usize) -> Self {
+        self.concurrency_limit_data_files = val;
+        self
+    }
+
+    /// Sets the desired size of batches in the response
+    /// to something other than the default.
+    pub fn with_batch_size(mut self, batch_size: usize) -> Self {
+        self.batch_size = Some(batch_size);
+        self
+    }
+
+    /// Determines whether to enable row group filtering.
+    pub fn with_row_group_filtering_enabled(mut self, row_group_filtering_enabled: bool) -> Self {
+        self.row_group_filtering_enabled = row_group_filtering_enabled;
+        self
+    }
+
+    /// Determines whether to enable row selection.
+    pub fn with_row_selection_enabled(mut self, row_selection_enabled: bool) -> Self {
+        self.row_selection_enabled = row_selection_enabled;
+        self
+    }
+
+    /// Provide a hint as to the number of bytes to prefetch for parsing the Parquet metadata.
+    ///
+    /// This hint can help reduce the number of fetch requests. For more details see the
+    /// [ParquetMetaDataReader documentation](https://docs.rs/parquet/latest/parquet/file/metadata/struct.ParquetMetaDataReader.html#method.with_prefetch_hint).
+    pub fn with_metadata_size_hint(mut self, metadata_size_hint: usize) -> Self {
+        self.parquet_read_options.metadata_size_hint = Some(metadata_size_hint);
+        self
+    }
+
+    /// Sets the gap threshold for merging nearby byte ranges into a single request.
+    /// Ranges with gaps smaller than this value will be coalesced.
+    ///
+    /// Defaults to 1 MiB, matching object_store's OBJECT_STORE_COALESCE_DEFAULT.
+    pub fn with_range_coalesce_bytes(mut self, range_coalesce_bytes: u64) -> Self {
+        self.parquet_read_options.range_coalesce_bytes = range_coalesce_bytes;
+        self
+    }
+
+    /// Sets the maximum number of merged byte ranges to fetch concurrently.
+    ///
+    /// Defaults to 10, matching object_store's OBJECT_STORE_COALESCE_PARALLEL.
+    pub fn with_range_fetch_concurrency(mut self, range_fetch_concurrency: usize) -> Self {
+        self.parquet_read_options.range_fetch_concurrency = range_fetch_concurrency;
+        self
+    }
+
+    /// Build the ArrowReader.
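+    ///
+    /// A minimal usage sketch (illustrative only, not compiled as a doctest;
+    /// assumes a `file_io` handle is already in scope):
+    /// ```ignore
+    /// let reader = ArrowReaderBuilder::new(file_io)
+    ///     .with_batch_size(8192)
+    ///     .with_range_coalesce_bytes(1024 * 1024)
+    ///     .with_range_fetch_concurrency(10)
+    ///     .build();
+    /// ```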
+    pub fn build(self) -> ArrowReader {
+        ArrowReader {
+            batch_size: self.batch_size,
+            file_io: self.file_io.clone(),
+            delete_file_loader: CachingDeleteFileLoader::new(
+                self.file_io.clone(),
+                self.concurrency_limit_data_files,
+            ),
+            concurrency_limit_data_files: self.concurrency_limit_data_files,
+            row_group_filtering_enabled: self.row_group_filtering_enabled,
+            row_selection_enabled: self.row_selection_enabled,
+            parquet_read_options: self.parquet_read_options,
+        }
+    }
+}
+
+/// Reads data from Parquet files
+#[derive(Clone)]
+pub struct ArrowReader {
+    batch_size: Option<usize>,
+    file_io: FileIO,
+    delete_file_loader: CachingDeleteFileLoader,
+
+    /// The maximum number of data files that can be fetched at the same time
+    concurrency_limit_data_files: usize,
+
+    row_group_filtering_enabled: bool,
+    row_selection_enabled: bool,
+    parquet_read_options: ParquetReadOptions,
+}
diff --git a/crates/iceberg/src/arrow/reader/options.rs b/crates/iceberg/src/arrow/reader/options.rs
new file mode 100644
index 0000000000..ae6a3ed18e
--- /dev/null
+++ b/crates/iceberg/src/arrow/reader/options.rs
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Tunables for Parquet file I/O used by `ArrowReader`.
+
+use typed_builder::TypedBuilder;
+
+use super::{
+    DEFAULT_METADATA_SIZE_HINT, DEFAULT_RANGE_COALESCE_BYTES, DEFAULT_RANGE_FETCH_CONCURRENCY,
+};
+
+/// Options for tuning Parquet file I/O.
+#[derive(Clone, Copy, Debug, TypedBuilder)]
+#[builder(field_defaults(setter(prefix = "with_")))]
+pub(crate) struct ParquetReadOptions {
+    /// Number of bytes to prefetch for parsing the Parquet metadata.
+    ///
+    /// This hint can help reduce the number of fetch requests. For more details see the
+    /// [ParquetMetaDataReader documentation](https://docs.rs/parquet/latest/parquet/file/metadata/struct.ParquetMetaDataReader.html#method.with_prefetch_hint).
+    ///
+    /// Defaults to 512 KiB, matching DataFusion's default `ParquetOptions::metadata_size_hint`.
+    #[builder(default = Some(DEFAULT_METADATA_SIZE_HINT))]
+    pub(crate) metadata_size_hint: Option<usize>,
+    /// Gap threshold for merging nearby byte ranges into a single request.
+    /// Ranges with gaps smaller than this value will be coalesced.
+    ///
+    /// Defaults to 1 MiB, matching object_store's `OBJECT_STORE_COALESCE_DEFAULT`.
+    #[builder(default = DEFAULT_RANGE_COALESCE_BYTES)]
+    pub(crate) range_coalesce_bytes: u64,
+    /// Maximum number of merged byte ranges to fetch concurrently.
+    ///
+    /// Defaults to 10, matching object_store's `OBJECT_STORE_COALESCE_PARALLEL`.
+    #[builder(default = DEFAULT_RANGE_FETCH_CONCURRENCY)]
+    pub(crate) range_fetch_concurrency: usize,
+    /// Whether to preload the column index when reading Parquet metadata.
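+    ///
+    /// The column index stores per-page min/max statistics, which lets the
+    /// reader prune individual pages against a predicate without decoding them.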
+    #[builder(default = true)]
+    pub(crate) preload_column_index: bool,
+    /// Whether to preload the offset index when reading Parquet metadata.
+    #[builder(default = true)]
+    pub(crate) preload_offset_index: bool,
+    /// Whether to preload the page index when reading Parquet metadata.
+    #[builder(default = false)]
+    pub(crate) preload_page_index: bool,
+}
+
+impl ParquetReadOptions {
+    pub(crate) fn metadata_size_hint(&self) -> Option<usize> {
+        self.metadata_size_hint
+    }
+
+    pub(crate) fn range_coalesce_bytes(&self) -> u64 {
+        self.range_coalesce_bytes
+    }
+
+    pub(crate) fn range_fetch_concurrency(&self) -> usize {
+        self.range_fetch_concurrency
+    }
+
+    pub(crate) fn preload_column_index(&self) -> bool {
+        self.preload_column_index
+    }
+
+    pub(crate) fn preload_offset_index(&self) -> bool {
+        self.preload_offset_index
+    }
+
+    pub(crate) fn preload_page_index(&self) -> bool {
+        self.preload_page_index
+    }
+}
diff --git a/crates/iceberg/src/arrow/reader/pipeline.rs b/crates/iceberg/src/arrow/reader/pipeline.rs
new file mode 100644
index 0000000000..94059fc62b
--- /dev/null
+++ b/crates/iceberg/src/arrow/reader/pipeline.rs
@@ -0,0 +1,1174 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! The main `ArrowReader` pipeline: reading a stream of `FileScanTask`s,
+//! opening Parquet files and resolving schemas, then wiring projection,
+//! predicates, row-group / row selection, and delete handling into a stream
+//! of transformed Arrow `RecordBatch`es.
+
+use std::sync::Arc;
+
+use futures::{StreamExt, TryStreamExt};
+use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions};
+use parquet::arrow::{PARQUET_FIELD_ID_META_KEY, ParquetRecordBatchStreamBuilder};
+
+use super::{
+    ArrowFileReader, ArrowReader, ParquetReadOptions, add_fallback_field_ids_to_arrow_schema,
+    apply_name_mapping_to_arrow_schema,
+};
+use crate::arrow::caching_delete_file_loader::CachingDeleteFileLoader;
+use crate::arrow::int96::coerce_int96_timestamps;
+use crate::arrow::record_batch_transformer::RecordBatchTransformerBuilder;
+use crate::error::Result;
+use crate::io::{FileIO, FileMetadata};
+use crate::metadata_columns::{RESERVED_FIELD_ID_FILE, is_metadata_field};
+use crate::scan::{ArrowRecordBatchStream, FileScanTask, FileScanTaskStream};
+use crate::spec::Datum;
+use crate::{Error, ErrorKind};
+
+impl ArrowReader {
+    /// Takes a stream of FileScanTasks and reads all the files.
+    /// Returns a stream of Arrow RecordBatches containing the data from the files.
+    pub fn read(self, tasks: FileScanTaskStream) -> Result<ArrowRecordBatchStream> {
+        let file_io = self.file_io.clone();
+        let batch_size = self.batch_size;
+        let concurrency_limit_data_files = self.concurrency_limit_data_files;
+        let row_group_filtering_enabled = self.row_group_filtering_enabled;
+        let row_selection_enabled = self.row_selection_enabled;
+        let parquet_read_options = self.parquet_read_options;
+
+        // Fast-path for single concurrency to avoid overhead of try_flatten_unordered
+        let stream: ArrowRecordBatchStream = if concurrency_limit_data_files == 1 {
+            Box::pin(
+                tasks
+                    .and_then(move |task| {
+                        let file_io = file_io.clone();
+
+                        Self::process_file_scan_task(
+                            task,
+                            batch_size,
+                            file_io,
+                            self.delete_file_loader.clone(),
+                            row_group_filtering_enabled,
+                            row_selection_enabled,
+                            parquet_read_options,
+                        )
+                    })
+                    .map_err(|err| {
+                        Error::new(ErrorKind::Unexpected, "file scan task generation failed")
+                            .with_source(err)
+                    })
+                    .try_flatten(),
+            )
+        } else {
+            Box::pin(
+                tasks
+                    .map_ok(move |task| {
+                        let file_io = file_io.clone();
+
+                        Self::process_file_scan_task(
+                            task,
+                            batch_size,
+                            file_io,
+                            self.delete_file_loader.clone(),
+                            row_group_filtering_enabled,
+                            row_selection_enabled,
+                            parquet_read_options,
+                        )
+                    })
+                    .map_err(|err| {
+                        Error::new(ErrorKind::Unexpected, "file scan task generation failed")
+                            .with_source(err)
+                    })
+                    .try_buffer_unordered(concurrency_limit_data_files)
+                    .try_flatten_unordered(concurrency_limit_data_files),
+            )
+        };
+
+        Ok(stream)
+    }
+
+    async fn process_file_scan_task(
+        task: FileScanTask,
+        batch_size: Option<usize>,
+        file_io: FileIO,
+        delete_file_loader: CachingDeleteFileLoader,
+        row_group_filtering_enabled: bool,
+        row_selection_enabled: bool,
+        parquet_read_options: ParquetReadOptions,
+    ) -> Result<ArrowRecordBatchStream> {
+        let should_load_page_index =
+            (row_selection_enabled && task.predicate.is_some()) || !task.deletes.is_empty();
+        let mut parquet_read_options = parquet_read_options;
+        parquet_read_options.preload_page_index = should_load_page_index;
+
+        let delete_filter_rx =
+            delete_file_loader.load_deletes(&task.deletes, Arc::clone(&task.schema));
+
+        // Open the Parquet file once, loading its metadata
+        let (parquet_file_reader, arrow_metadata) = Self::open_parquet_file(
+            &task.data_file_path,
+            &file_io,
+            task.file_size_in_bytes,
+            parquet_read_options,
+        )
+        .await?;
+
+        // Check if Parquet file has embedded field IDs
+        // Corresponds to Java's ParquetSchemaUtil.hasIds()
+        // Reference: parquet/src/main/java/org/apache/iceberg/parquet/ParquetSchemaUtil.java:118
+        let missing_field_ids = arrow_metadata
+            .schema()
+            .fields()
+            .iter()
+            .next()
+            .is_some_and(|f| f.metadata().get(PARQUET_FIELD_ID_META_KEY).is_none());
+
+        // Three-branch schema resolution strategy matching Java's ReadConf constructor
+        //
+        // Per Iceberg spec Column Projection rules:
+        // "Columns in Iceberg data files are selected by field id. The table schema's column
+        // names and order may change after a data file is written, and projection must be done
+        // using field ids."
+        // https://iceberg.apache.org/spec/#column-projection
+        //
+        // When Parquet files lack field IDs (e.g., Hive/Spark migrations via add_files),
+        // we must assign field IDs BEFORE reading data to enable correct projection.
+ // + // Java's ReadConf determines field ID strategy: + // - Branch 1: hasIds(fileSchema) → trust embedded field IDs, use pruneColumns() + // - Branch 2: nameMapping present → applyNameMapping(), then pruneColumns() + // - Branch 3: fallback → addFallbackIds(), then pruneColumnsFallback() + let arrow_metadata = if missing_field_ids { + // Parquet file lacks field IDs - must assign them before reading + let arrow_schema = if let Some(name_mapping) = &task.name_mapping { + // Branch 2: Apply name mapping to assign correct Iceberg field IDs + // Per spec rule #2: "Use schema.name-mapping.default metadata to map field id + // to columns without field id" + // Corresponds to Java's ParquetSchemaUtil.applyNameMapping() + apply_name_mapping_to_arrow_schema( + Arc::clone(arrow_metadata.schema()), + name_mapping, + )? + } else { + // Branch 3: No name mapping - use position-based fallback IDs + // Corresponds to Java's ParquetSchemaUtil.addFallbackIds() + add_fallback_field_ids_to_arrow_schema(arrow_metadata.schema()) + }; + + let options = ArrowReaderOptions::new().with_schema(arrow_schema); + ArrowReaderMetadata::try_new(Arc::clone(arrow_metadata.metadata()), options).map_err( + |e| { + Error::new( + ErrorKind::Unexpected, + "Failed to create ArrowReaderMetadata with field ID schema", + ) + .with_source(e) + }, + )? + } else { + // Branch 1: File has embedded field IDs - trust them + arrow_metadata + }; + + // Coerce INT96 timestamp columns to the resolution specified by the Iceberg schema. + // This must happen before building the stream reader to avoid i64 overflow in arrow-rs. + let arrow_metadata = if let Some(coerced_schema) = + coerce_int96_timestamps(arrow_metadata.schema(), &task.schema) + { + let options = ArrowReaderOptions::new().with_schema(Arc::clone(&coerced_schema)); + ArrowReaderMetadata::try_new(Arc::clone(arrow_metadata.metadata()), options).map_err( + |e| { + Error::new( + ErrorKind::Unexpected, + format!( + "Failed to create ArrowReaderMetadata with INT96-coerced schema: {coerced_schema}" + ), + ) + .with_source(e) + }, + )? 
+ } else { + arrow_metadata + }; + + // Build the stream reader, reusing the already-opened file reader + let mut record_batch_stream_builder = + ParquetRecordBatchStreamBuilder::new_with_metadata(parquet_file_reader, arrow_metadata); + + // Filter out metadata fields for Parquet projection (they don't exist in files) + let project_field_ids_without_metadata: Vec = task + .project_field_ids + .iter() + .filter(|&&id| !is_metadata_field(id)) + .copied() + .collect(); + + // Create projection mask based on field IDs + // - If file has embedded IDs: field-ID-based projection (missing_field_ids=false) + // - If name mapping applied: field-ID-based projection (missing_field_ids=true but IDs now match) + // - If fallback IDs: position-based projection (missing_field_ids=true) + let projection_mask = Self::get_arrow_projection_mask( + &project_field_ids_without_metadata, + &task.schema, + record_batch_stream_builder.parquet_schema(), + record_batch_stream_builder.schema(), + missing_field_ids, // Whether to use position-based (true) or field-ID-based (false) projection + )?; + + record_batch_stream_builder = + record_batch_stream_builder.with_projection(projection_mask.clone()); + + // RecordBatchTransformer performs any transformations required on the RecordBatches + // that come back from the file, such as type promotion, default column insertion, + // column re-ordering, partition constants, and virtual field addition (like _file) + let mut record_batch_transformer_builder = + RecordBatchTransformerBuilder::new(task.schema_ref(), task.project_field_ids()); + + // Add the _file metadata column if it's in the projected fields + if task.project_field_ids().contains(&RESERVED_FIELD_ID_FILE) { + let file_datum = Datum::string(task.data_file_path.clone()); + record_batch_transformer_builder = + record_batch_transformer_builder.with_constant(RESERVED_FIELD_ID_FILE, file_datum); + } + + if let (Some(partition_spec), Some(partition_data)) = + (task.partition_spec.clone(), task.partition.clone()) + { + record_batch_transformer_builder = + record_batch_transformer_builder.with_partition(partition_spec, partition_data)?; + } + + let mut record_batch_transformer = record_batch_transformer_builder.build(); + + if let Some(batch_size) = batch_size { + record_batch_stream_builder = record_batch_stream_builder.with_batch_size(batch_size); + } + + let delete_filter = delete_filter_rx.await.unwrap()?; + let delete_predicate = delete_filter.build_equality_delete_predicate(&task).await?; + + // In addition to the optional predicate supplied in the `FileScanTask`, + // we also have an optional predicate resulting from equality delete files. + // If both are present, we logical-AND them together to form a single filter + // predicate that we can pass to the `RecordBatchStreamBuilder`. + let final_predicate = match (&task.predicate, delete_predicate) { + (None, None) => None, + (Some(predicate), None) => Some(predicate.clone()), + (None, Some(ref predicate)) => Some(predicate.clone()), + (Some(filter_predicate), Some(delete_predicate)) => { + Some(filter_predicate.clone().and(delete_predicate)) + } + }; + + // There are three possible sources for potential lists of selected RowGroup indices, + // and two for `RowSelection`s. + // Selected RowGroup index lists can come from three sources: + // * When task.start and task.length specify a byte range (file splitting); + // * When there are equality delete files that are applicable; + // * When there is a scan predicate and row_group_filtering_enabled = true. 
+ // `RowSelection`s can be created in either or both of the following cases: + // * When there are positional delete files that are applicable; + // * When there is a scan predicate and row_selection_enabled = true + // Note that row group filtering from predicates only happens when + // there is a scan predicate AND row_group_filtering_enabled = true, + // but we perform row selection filtering if there are applicable + // equality delete files OR (there is a scan predicate AND row_selection_enabled), + // since the only implemented method of applying positional deletes is + // by using a `RowSelection`. + let mut selected_row_group_indices = None; + let mut row_selection = None; + + // Filter row groups based on byte range from task.start and task.length. + // If both start and length are 0, read the entire file (backwards compatibility). + if task.start != 0 || task.length != 0 { + let byte_range_filtered_row_groups = Self::filter_row_groups_by_byte_range( + record_batch_stream_builder.metadata(), + task.start, + task.length, + )?; + selected_row_group_indices = Some(byte_range_filtered_row_groups); + } + + if let Some(predicate) = final_predicate { + let (iceberg_field_ids, field_id_map) = Self::build_field_id_set_and_map( + record_batch_stream_builder.parquet_schema(), + &predicate, + )?; + + let row_filter = Self::get_row_filter( + &predicate, + record_batch_stream_builder.parquet_schema(), + &iceberg_field_ids, + &field_id_map, + )?; + record_batch_stream_builder = record_batch_stream_builder.with_row_filter(row_filter); + + if row_group_filtering_enabled { + let predicate_filtered_row_groups = Self::get_selected_row_group_indices( + &predicate, + record_batch_stream_builder.metadata(), + &field_id_map, + &task.schema, + )?; + + // Merge predicate-based filtering with byte range filtering (if present) + // by taking the intersection of both filters + selected_row_group_indices = match selected_row_group_indices { + Some(byte_range_filtered) => { + // Keep only row groups that are in both filters + let intersection: Vec = byte_range_filtered + .into_iter() + .filter(|idx| predicate_filtered_row_groups.contains(idx)) + .collect(); + Some(intersection) + } + None => Some(predicate_filtered_row_groups), + }; + } + + if row_selection_enabled { + row_selection = Some(Self::get_row_selection_for_filter_predicate( + &predicate, + record_batch_stream_builder.metadata(), + &selected_row_group_indices, + &field_id_map, + &task.schema, + )?); + } + } + + let positional_delete_indexes = delete_filter.get_delete_vector(&task); + + if let Some(positional_delete_indexes) = positional_delete_indexes { + let delete_row_selection = { + let positional_delete_indexes = positional_delete_indexes.lock().unwrap(); + + Self::build_deletes_row_selection( + record_batch_stream_builder.metadata().row_groups(), + &selected_row_group_indices, + &positional_delete_indexes, + ) + }?; + + // merge the row selection from the delete files with the row selection + // from the filter predicate, if there is one from the filter predicate + row_selection = match row_selection { + None => Some(delete_row_selection), + Some(filter_row_selection) => { + Some(filter_row_selection.intersection(&delete_row_selection)) + } + }; + } + + if let Some(row_selection) = row_selection { + record_batch_stream_builder = + record_batch_stream_builder.with_row_selection(row_selection); + } + + if let Some(selected_row_group_indices) = selected_row_group_indices { + record_batch_stream_builder = + 
record_batch_stream_builder.with_row_groups(selected_row_group_indices); + } + + // Build the batch stream and send all the RecordBatches that it generates + // to the requester. + let record_batch_stream = + record_batch_stream_builder + .build()? + .map(move |batch| match batch { + Ok(batch) => { + // Process the record batch (type promotion, column reordering, virtual fields, etc.) + record_batch_transformer.process_record_batch(batch) + } + Err(err) => Err(err.into()), + }); + + Ok(Box::pin(record_batch_stream) as ArrowRecordBatchStream) + } + + /// Opens a Parquet file and loads its metadata, returning both the reader and metadata. + /// The reader can be reused to build a `ParquetRecordBatchStreamBuilder` without + /// reopening the file. + pub(crate) async fn open_parquet_file( + data_file_path: &str, + file_io: &FileIO, + file_size_in_bytes: u64, + parquet_read_options: ParquetReadOptions, + ) -> Result<(ArrowFileReader, ArrowReaderMetadata)> { + let parquet_file = file_io.new_input(data_file_path)?; + let parquet_reader = parquet_file.reader().await?; + let mut reader = ArrowFileReader::new( + FileMetadata { + size: file_size_in_bytes, + }, + parquet_reader, + ) + .with_parquet_read_options(parquet_read_options); + + let arrow_metadata = ArrowReaderMetadata::load_async(&mut reader, Default::default()) + .await + .map_err(|e| { + Error::new(ErrorKind::Unexpected, "Failed to load Parquet metadata").with_source(e) + })?; + + Ok((reader, arrow_metadata)) + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::fs::File; + use std::sync::Arc; + + use arrow_array::cast::AsArray; + use arrow_array::{Array, ArrayRef, RecordBatch}; + use arrow_schema::{DataType, Field, Schema as ArrowSchema}; + use futures::TryStreamExt; + use parquet::arrow::{ArrowWriter, PARQUET_FIELD_ID_META_KEY}; + use parquet::basic::Compression; + use parquet::file::properties::WriterProperties; + use tempfile::TempDir; + + use crate::arrow::ArrowReaderBuilder; + use crate::io::FileIO; + use crate::scan::{FileScanTask, FileScanTaskStream}; + use crate::spec::{DataFileFormat, NestedField, PrimitiveType, Schema, SchemaRef, Type}; + + // INT96 encoding: [nanos_low_u32, nanos_high_u32, julian_day_u32] + // Julian day 2_440_588 = Unix epoch (1970-01-01) + const UNIX_EPOCH_JULIAN: i64 = 2_440_588; + const MICROS_PER_DAY: i64 = 86_400_000_000; + // Noon on 3333-01-01 (Julian day 2_953_529) — outside the i64 nanosecond range (~1677-2262). 
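+    // 12 h * 3_600 s/h * 1_000_000_000 ns/s = 43_200_000_000_000 ns into the day.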
+ const INT96_TEST_NANOS_WITHIN_DAY: u64 = 43_200_000_000_000; + const INT96_TEST_JULIAN_DAY: u32 = 2_953_529; + + fn make_int96_test_value() -> (parquet::data_type::Int96, i64) { + let mut val = parquet::data_type::Int96::new(); + val.set_data( + (INT96_TEST_NANOS_WITHIN_DAY & 0xFFFFFFFF) as u32, + (INT96_TEST_NANOS_WITHIN_DAY >> 32) as u32, + INT96_TEST_JULIAN_DAY, + ); + let expected_micros = (INT96_TEST_JULIAN_DAY as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY + + (INT96_TEST_NANOS_WITHIN_DAY / 1_000) as i64; + (val, expected_micros) + } + + async fn read_int96_batches( + file_path: &str, + schema: SchemaRef, + project_field_ids: Vec, + ) -> Vec { + let file_io = FileIO::new_with_fs(); + let reader = ArrowReaderBuilder::new(file_io).build(); + + let file_size = std::fs::metadata(file_path).unwrap().len(); + let task = FileScanTask { + file_size_in_bytes: file_size, + start: 0, + length: file_size, + record_count: None, + data_file_path: file_path.to_string(), + data_file_format: DataFileFormat::Parquet, + schema, + project_field_ids, + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + }; + + let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream; + reader.read(tasks).unwrap().try_collect().await.unwrap() + } + + // ArrowWriter cannot write INT96, so we use SerializedFileWriter directly. + fn write_int96_parquet_file( + table_location: &str, + filename: &str, + with_field_ids: bool, + ) -> (String, Vec) { + use parquet::basic::{Repetition, Type as PhysicalType}; + use parquet::data_type::{Int32Type, Int96, Int96Type}; + use parquet::file::writer::SerializedFileWriter; + use parquet::schema::types::Type as SchemaType; + + let file_path = format!("{table_location}/{filename}"); + + let mut ts_builder = SchemaType::primitive_type_builder("ts", PhysicalType::INT96) + .with_repetition(Repetition::OPTIONAL); + let mut id_builder = SchemaType::primitive_type_builder("id", PhysicalType::INT32) + .with_repetition(Repetition::REQUIRED); + + if with_field_ids { + ts_builder = ts_builder.with_id(Some(1)); + id_builder = id_builder.with_id(Some(2)); + } + + let schema = SchemaType::group_type_builder("schema") + .with_fields(vec![ + Arc::new(ts_builder.build().unwrap()), + Arc::new(id_builder.build().unwrap()), + ]) + .build() + .unwrap(); + + // Dates outside the i64 nanosecond range (~1677-2262) overflow without coercion. 
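+        // (i64 nanoseconds since the Unix epoch cover 1677-09-21 through 2262-04-11.)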
+ const NOON_NANOS: u64 = INT96_TEST_NANOS_WITHIN_DAY; + const JULIAN_3333: u32 = INT96_TEST_JULIAN_DAY; + const JULIAN_2100: u32 = 2_488_070; + + let test_data: Vec<(u32, u32, u32, i64)> = vec![ + // 3333-01-01 00:00:00 + ( + 0, + 0, + JULIAN_3333, + (JULIAN_3333 as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY, + ), + // 3333-01-01 12:00:00 + ( + (NOON_NANOS & 0xFFFFFFFF) as u32, + (NOON_NANOS >> 32) as u32, + JULIAN_3333, + (JULIAN_3333 as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY + + (NOON_NANOS / 1_000) as i64, + ), + // 2100-01-01 00:00:00 + ( + 0, + 0, + JULIAN_2100, + (JULIAN_2100 as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY, + ), + ]; + + let int96_values: Vec = test_data + .iter() + .map(|(lo, hi, day, _)| { + let mut v = Int96::new(); + v.set_data(*lo, *hi, *day); + v + }) + .collect(); + + let id_values: Vec = (0..test_data.len() as i32).collect(); + let expected_micros: Vec = test_data.iter().map(|(_, _, _, m)| *m).collect(); + + let file = File::create(&file_path).unwrap(); + let mut writer = + SerializedFileWriter::new(file, Arc::new(schema), Default::default()).unwrap(); + + let mut row_group = writer.next_row_group().unwrap(); + { + // def=1: ts is OPTIONAL and present. No repetition levels (top-level columns). + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&int96_values, Some(&vec![1; test_data.len()]), None) + .unwrap(); + col.close().unwrap(); + } + { + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&id_values, None, None) + .unwrap(); + col.close().unwrap(); + } + row_group.close().unwrap(); + writer.close().unwrap(); + + (file_path, expected_micros) + } + + async fn assert_int96_read_matches( + file_path: &str, + schema: SchemaRef, + project_field_ids: Vec, + expected_micros: &[i64], + ) { + use arrow_array::TimestampMicrosecondArray; + + let batches = read_int96_batches(file_path, schema, project_field_ids).await; + + assert_eq!(batches.len(), 1); + let ts_array = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected TimestampMicrosecondArray"); + + for (i, expected) in expected_micros.iter().enumerate() { + assert_eq!( + ts_array.value(i), + *expected, + "Row {i}: got {}, expected {expected}", + ts_array.value(i) + ); + } + } + + /// Test that concurrency=1 reads all files correctly and in deterministic order. + /// This verifies the fast-path optimization for single concurrency. 
+ #[tokio::test] + async fn test_read_with_concurrency_one() { + use arrow_array::Int32Array; + + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::required(2, "file_num", Type::Primitive(PrimitiveType::Int)) + .into(), + ]) + .build() + .unwrap(), + ); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "1".to_string(), + )])), + Field::new("file_num", DataType::Int32, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "2".to_string(), + )])), + ])); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_io = FileIO::new_with_fs(); + + // Create 3 parquet files with different data + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + + for file_num in 0..3 { + let id_data = Arc::new(Int32Array::from_iter_values( + file_num * 10..(file_num + 1) * 10, + )) as ArrayRef; + let file_num_data = Arc::new(Int32Array::from(vec![file_num; 10])) as ArrayRef; + + let to_write = + RecordBatch::try_new(arrow_schema.clone(), vec![id_data, file_num_data]).unwrap(); + + let file = File::create(format!("{table_location}/file_{file_num}.parquet")).unwrap(); + let mut writer = + ArrowWriter::try_new(file, to_write.schema(), Some(props.clone())).unwrap(); + writer.write(&to_write).expect("Writing batch"); + writer.close().unwrap(); + } + + // Read with concurrency=1 (fast-path) + let reader = ArrowReaderBuilder::new(file_io) + .with_data_file_concurrency_limit(1) + .build(); + + // Create tasks in a specific order: file_0, file_1, file_2 + let tasks = vec![ + Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/file_0.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/file_0.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 2], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + }), + Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/file_1.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/file_1.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 2], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + }), + Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/file_2.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/file_2.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 2], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + }), + ]; + + let tasks_stream = Box::pin(futures::stream::iter(tasks)) as FileScanTaskStream; + + let result = reader + .read(tasks_stream) + .unwrap() + .try_collect::>() + .await + .unwrap(); + + // Verify we got all 30 rows (10 from each file) + let total_rows: usize = 
result.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 30, "Should have 30 total rows"); + + // Collect all ids and file_nums to verify data + let mut all_ids = Vec::new(); + let mut all_file_nums = Vec::new(); + + for batch in &result { + let id_col = batch + .column(0) + .as_primitive::(); + let file_num_col = batch + .column(1) + .as_primitive::(); + + for i in 0..batch.num_rows() { + all_ids.push(id_col.value(i)); + all_file_nums.push(file_num_col.value(i)); + } + } + + assert_eq!(all_ids.len(), 30); + assert_eq!(all_file_nums.len(), 30); + + // With concurrency=1 and sequential processing, files should be processed in order + // file_0: ids 0-9, file_num=0 + // file_1: ids 10-19, file_num=1 + // file_2: ids 20-29, file_num=2 + for i in 0..10 { + assert_eq!(all_file_nums[i], 0, "First 10 rows should be from file_0"); + assert_eq!(all_ids[i], i as i32, "IDs should be 0-9"); + } + for i in 10..20 { + assert_eq!(all_file_nums[i], 1, "Next 10 rows should be from file_1"); + assert_eq!(all_ids[i], i as i32, "IDs should be 10-19"); + } + for i in 20..30 { + assert_eq!(all_file_nums[i], 2, "Last 10 rows should be from file_2"); + assert_eq!(all_ids[i], i as i32, "IDs should be 20-29"); + } + } + + #[tokio::test] + async fn test_read_int96_timestamps_with_field_ids() { + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::Timestamp)) + .into(), + NestedField::required(2, "id", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let (file_path, expected_micros) = + write_int96_parquet_file(&table_location, "with_ids.parquet", true); + + assert_int96_read_matches(&file_path, schema, vec![1, 2], &expected_micros).await; + } + + #[tokio::test] + async fn test_read_int96_timestamps_without_field_ids() { + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::Timestamp)) + .into(), + NestedField::required(2, "id", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let (file_path, expected_micros) = + write_int96_parquet_file(&table_location, "no_ids.parquet", false); + + assert_int96_read_matches(&file_path, schema, vec![1, 2], &expected_micros).await; + } + + #[tokio::test] + async fn test_read_int96_timestamps_in_struct() { + use arrow_array::{StructArray, TimestampMicrosecondArray}; + use parquet::basic::{Repetition, Type as PhysicalType}; + use parquet::data_type::Int96Type; + use parquet::file::writer::SerializedFileWriter; + use parquet::schema::types::Type as SchemaType; + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_path = format!("{table_location}/struct_int96.parquet"); + + let ts_type = SchemaType::primitive_type_builder("ts", PhysicalType::INT96) + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(2)) + .build() + .unwrap(); + + let struct_type = SchemaType::group_type_builder("data") + .with_repetition(Repetition::REQUIRED) + .with_id(Some(1)) + .with_fields(vec![Arc::new(ts_type)]) + .build() + .unwrap(); + + let parquet_schema = SchemaType::group_type_builder("schema") + .with_fields(vec![Arc::new(struct_type)]) + .build() + 
.unwrap(); + + let (int96_val, expected_micros) = make_int96_test_value(); + + let file = File::create(&file_path).unwrap(); + let mut writer = + SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); + + // def=1: struct is REQUIRED so no level, ts is OPTIONAL and present (1). + // No repetition levels needed (no repeated groups). + let mut row_group = writer.next_row_group().unwrap(); + { + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&[int96_val], Some(&[1]), None) + .unwrap(); + col.close().unwrap(); + } + row_group.close().unwrap(); + writer.close().unwrap(); + + let iceberg_schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required( + 1, + "data", + Type::Struct(crate::spec::StructType::new(vec![ + NestedField::optional( + 2, + "ts", + Type::Primitive(PrimitiveType::Timestamp), + ) + .into(), + ])), + ) + .into(), + ]) + .build() + .unwrap(), + ); + + let batches = read_int96_batches(&file_path, iceberg_schema, vec![1]).await; + + assert_eq!(batches.len(), 1); + let struct_array = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected StructArray"); + let ts_array = struct_array + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected TimestampMicrosecondArray inside struct"); + + assert_eq!( + ts_array.value(0), + expected_micros, + "INT96 in struct: got {}, expected {expected_micros}", + ts_array.value(0) + ); + } + + #[tokio::test] + async fn test_read_int96_timestamps_in_list() { + use arrow_array::{ListArray, TimestampMicrosecondArray}; + use parquet::basic::{Repetition, Type as PhysicalType}; + use parquet::data_type::Int96Type; + use parquet::file::writer::SerializedFileWriter; + use parquet::schema::types::Type as SchemaType; + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_path = format!("{table_location}/list_int96.parquet"); + + // 3-level LIST encoding: + // optional group timestamps (LIST) { + // repeated group list { + // optional int96 element; + // } + // } + let element_type = SchemaType::primitive_type_builder("element", PhysicalType::INT96) + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(2)) + .build() + .unwrap(); + + let list_group = SchemaType::group_type_builder("list") + .with_repetition(Repetition::REPEATED) + .with_fields(vec![Arc::new(element_type)]) + .build() + .unwrap(); + + let list_type = SchemaType::group_type_builder("timestamps") + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(1)) + .with_logical_type(Some(parquet::basic::LogicalType::List)) + .with_fields(vec![Arc::new(list_group)]) + .build() + .unwrap(); + + let parquet_schema = SchemaType::group_type_builder("schema") + .with_fields(vec![Arc::new(list_type)]) + .build() + .unwrap(); + + let (int96_val, expected_micros) = make_int96_test_value(); + + let file = File::create(&file_path).unwrap(); + let mut writer = + SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); + + // Write a single row with a list containing one INT96 element. 
+ // def=3: list present (1) + repeated group (2) + element present (3) + // rep=0: start of a new list + let mut row_group = writer.next_row_group().unwrap(); + { + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&[int96_val], Some(&[3]), Some(&[0])) + .unwrap(); + col.close().unwrap(); + } + row_group.close().unwrap(); + writer.close().unwrap(); + + let iceberg_schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional( + 1, + "timestamps", + Type::List(crate::spec::ListType { + element_field: NestedField::optional( + 2, + "element", + Type::Primitive(PrimitiveType::Timestamp), + ) + .into(), + }), + ) + .into(), + ]) + .build() + .unwrap(), + ); + + let batches = read_int96_batches(&file_path, iceberg_schema, vec![1]).await; + + assert_eq!(batches.len(), 1); + let list_array = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected ListArray"); + let ts_array = list_array + .values() + .as_any() + .downcast_ref::() + .expect("Expected TimestampMicrosecondArray inside list"); + + assert_eq!( + ts_array.value(0), + expected_micros, + "INT96 in list: got {}, expected {expected_micros}", + ts_array.value(0) + ); + } + + #[tokio::test] + async fn test_read_int96_timestamps_in_map() { + use arrow_array::{MapArray, TimestampMicrosecondArray}; + use parquet::basic::{Repetition, Type as PhysicalType}; + use parquet::data_type::{ByteArrayType, Int96Type}; + use parquet::file::writer::SerializedFileWriter; + use parquet::schema::types::Type as SchemaType; + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_path = format!("{table_location}/map_int96.parquet"); + + // MAP encoding: + // optional group ts_map (MAP) { + // repeated group key_value { + // required binary key (UTF8); + // optional int96 value; + // } + // } + let key_type = SchemaType::primitive_type_builder("key", PhysicalType::BYTE_ARRAY) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(Some(parquet::basic::LogicalType::String)) + .with_id(Some(2)) + .build() + .unwrap(); + + let value_type = SchemaType::primitive_type_builder("value", PhysicalType::INT96) + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(3)) + .build() + .unwrap(); + + let key_value_group = SchemaType::group_type_builder("key_value") + .with_repetition(Repetition::REPEATED) + .with_fields(vec![Arc::new(key_type), Arc::new(value_type)]) + .build() + .unwrap(); + + let map_type = SchemaType::group_type_builder("ts_map") + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(1)) + .with_logical_type(Some(parquet::basic::LogicalType::Map)) + .with_fields(vec![Arc::new(key_value_group)]) + .build() + .unwrap(); + + let parquet_schema = SchemaType::group_type_builder("schema") + .with_fields(vec![Arc::new(map_type)]) + .build() + .unwrap(); + + let (int96_val, expected_micros) = make_int96_test_value(); + + let file = File::create(&file_path).unwrap(); + let mut writer = + SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); + + // Write a single row with a map containing one key-value pair. + // rep=0 for both columns: start of a new map. + // key def=2: map present (1) + key_value entry present (2), key is REQUIRED. + // value def=3: map present (1) + key_value entry present (2) + value present (3). 
+ let mut row_group = writer.next_row_group().unwrap(); + { + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch( + &[parquet::data_type::ByteArray::from("event_time")], + Some(&[2]), + Some(&[0]), + ) + .unwrap(); + col.close().unwrap(); + } + { + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&[int96_val], Some(&[3]), Some(&[0])) + .unwrap(); + col.close().unwrap(); + } + row_group.close().unwrap(); + writer.close().unwrap(); + + let iceberg_schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional( + 1, + "ts_map", + Type::Map(crate::spec::MapType { + key_field: NestedField::required( + 2, + "key", + Type::Primitive(PrimitiveType::String), + ) + .into(), + value_field: NestedField::optional( + 3, + "value", + Type::Primitive(PrimitiveType::Timestamp), + ) + .into(), + }), + ) + .into(), + ]) + .build() + .unwrap(), + ); + + let batches = read_int96_batches(&file_path, iceberg_schema, vec![1]).await; + + assert_eq!(batches.len(), 1); + let map_array = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected MapArray"); + let ts_array = map_array + .values() + .as_any() + .downcast_ref::() + .expect("Expected TimestampMicrosecondArray as map values"); + + assert_eq!( + ts_array.value(0), + expected_micros, + "INT96 in map: got {}, expected {expected_micros}", + ts_array.value(0) + ); + } +} diff --git a/crates/iceberg/src/arrow/reader/positional_deletes.rs b/crates/iceberg/src/arrow/reader/positional_deletes.rs new file mode 100644 index 0000000000..eea031852b --- /dev/null +++ b/crates/iceberg/src/arrow/reader/positional_deletes.rs @@ -0,0 +1,931 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Positional delete handling for `ArrowReader`: converting a `DeleteVector` +//! into a Parquet `RowSelection` that skips the deleted rows, while respecting +//! any row-group selection made by the predicate evaluator. + +use parquet::arrow::arrow_reader::{RowSelection, RowSelector}; +use parquet::file::metadata::RowGroupMetaData; + +use super::ArrowReader; +use crate::delete_vector::DeleteVector; +use crate::error::Result; + +impl ArrowReader { + /// computes a `RowSelection` from positional delete indices. 
+    ///
+    /// Using the Parquet row group metadata, we build a `RowSelection` that rejects rows that are
+    /// indicated as having been deleted by a positional delete, taking into account any row groups
+    /// that have been skipped entirely by the filter predicate.
+    pub(super) fn build_deletes_row_selection(
+        row_group_metadata_list: &[RowGroupMetaData],
+        selected_row_groups: &Option<Vec<usize>>,
+        positional_deletes: &DeleteVector,
+    ) -> Result<RowSelection> {
+        let mut results: Vec<RowSelector> = Vec::new();
+        let mut selected_row_groups_idx = 0;
+        let mut current_row_group_base_idx: u64 = 0;
+        let mut delete_vector_iter = positional_deletes.iter();
+        let mut next_deleted_row_idx_opt = delete_vector_iter.next();
+
+        for (idx, row_group_metadata) in row_group_metadata_list.iter().enumerate() {
+            let row_group_num_rows = row_group_metadata.num_rows() as u64;
+            let next_row_group_base_idx = current_row_group_base_idx + row_group_num_rows;
+
+            // if row group selection is enabled,
+            if let Some(selected_row_groups) = selected_row_groups {
+                // if we've consumed all the selected row groups, we're done
+                if selected_row_groups_idx == selected_row_groups.len() {
+                    break;
+                }
+
+                if idx == selected_row_groups[selected_row_groups_idx] {
+                    // we're in a selected row group. Increment selected_row_groups_idx
+                    // so that next time around the for loop we're looking for the next
+                    // selected row group
+                    selected_row_groups_idx += 1;
+                } else {
+                    // Advance iterator past all deletes in the skipped row group.
+                    // advance_to() positions the iterator to the first delete >= next_row_group_base_idx.
+                    // However, if our cached next_deleted_row_idx_opt is in the skipped range,
+                    // we need to call next() to update the cache with the newly positioned value.
+                    delete_vector_iter.advance_to(next_row_group_base_idx);
+                    // Only update the cache if the cached value is stale (in the skipped range)
+                    if let Some(cached_idx) = next_deleted_row_idx_opt
+                        && cached_idx < next_row_group_base_idx
+                    {
+                        next_deleted_row_idx_opt = delete_vector_iter.next();
+                    }
+
+                    // still increment the current row group base index but then skip to the next
+                    // row group in the file
+                    current_row_group_base_idx += row_group_num_rows;
+                    continue;
+                }
+            }
+
+            let mut next_deleted_row_idx = match next_deleted_row_idx_opt {
+                Some(next_deleted_row_idx) => {
+                    // if the index of the next deleted row is beyond this row group, add a selection for
+                    // the remainder of this row group and skip to the next row group
+                    if next_deleted_row_idx >= next_row_group_base_idx {
+                        results.push(RowSelector::select(row_group_num_rows as usize));
+                        current_row_group_base_idx += row_group_num_rows;
+                        continue;
+                    }
+
+                    next_deleted_row_idx
+                }
+
+                // If there are no more positional deletes, add a selector for the entirety of this row group.
+ _ => { + results.push(RowSelector::select(row_group_num_rows as usize)); + current_row_group_base_idx += row_group_num_rows; + continue; + } + }; + + let mut current_idx = current_row_group_base_idx; + 'chunks: while next_deleted_row_idx < next_row_group_base_idx { + // `select` all rows that precede the next delete index + if current_idx < next_deleted_row_idx { + let run_length = next_deleted_row_idx - current_idx; + results.push(RowSelector::select(run_length as usize)); + current_idx += run_length; + } + + // `skip` all consecutive deleted rows in the current row group + let mut run_length = 0; + while next_deleted_row_idx == current_idx + && next_deleted_row_idx < next_row_group_base_idx + { + run_length += 1; + current_idx += 1; + + next_deleted_row_idx_opt = delete_vector_iter.next(); + next_deleted_row_idx = match next_deleted_row_idx_opt { + Some(next_deleted_row_idx) => next_deleted_row_idx, + _ => { + // We've processed the final positional delete. + // Conclude the skip and then break so that we select the remaining + // rows in the row group and move on to the next row group + results.push(RowSelector::skip(run_length)); + break 'chunks; + } + }; + } + if run_length > 0 { + results.push(RowSelector::skip(run_length)); + } + } + + if current_idx < next_row_group_base_idx { + results.push(RowSelector::select( + (next_row_group_base_idx - current_idx) as usize, + )); + } + + current_row_group_base_idx += row_group_num_rows; + } + + Ok(results.into()) + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::fs::File; + use std::sync::Arc; + + use arrow_array::cast::AsArray; + use arrow_array::{RecordBatch, StringArray}; + use arrow_schema::{DataType, Field, Schema as ArrowSchema}; + use futures::TryStreamExt; + use parquet::arrow::arrow_reader::{RowSelection, RowSelector}; + use parquet::arrow::{ArrowWriter, PARQUET_FIELD_ID_META_KEY}; + use parquet::basic::Compression; + use parquet::file::metadata::{ColumnChunkMetaData, RowGroupMetaData}; + use parquet::file::properties::WriterProperties; + use parquet::schema::types::{SchemaDescPtr, SchemaDescriptor}; + use roaring::RoaringTreemap; + use tempfile::TempDir; + + use crate::arrow::{ArrowReader, ArrowReaderBuilder}; + use crate::delete_vector::DeleteVector; + use crate::io::FileIO; + use crate::scan::{FileScanTask, FileScanTaskDeleteFile, FileScanTaskStream}; + use crate::spec::{DataContentType, DataFileFormat, NestedField, PrimitiveType, Schema, Type}; + + fn build_test_row_group_meta( + schema_descr: SchemaDescPtr, + columns: Vec, + num_rows: i64, + ordinal: i16, + ) -> RowGroupMetaData { + RowGroupMetaData::builder(schema_descr.clone()) + .set_num_rows(num_rows) + .set_total_byte_size(2000) + .set_column_metadata(columns) + .set_ordinal(ordinal) + .build() + .unwrap() + } + + fn get_test_schema_descr() -> SchemaDescPtr { + use parquet::schema::types::Type as SchemaType; + + let schema = SchemaType::group_type_builder("schema") + .with_fields(vec![ + Arc::new( + SchemaType::primitive_type_builder("a", parquet::basic::Type::INT32) + .build() + .unwrap(), + ), + Arc::new( + SchemaType::primitive_type_builder("b", parquet::basic::Type::INT32) + .build() + .unwrap(), + ), + ]) + .build() + .unwrap(); + + Arc::new(SchemaDescriptor::new(Arc::new(schema))) + } + + #[test] + fn test_build_deletes_row_selection() { + let schema_descr = get_test_schema_descr(); + + let mut columns = vec![]; + for ptr in schema_descr.columns() { + let column = ColumnChunkMetaData::builder(ptr.clone()).build().unwrap(); + 
columns.push(column); + } + + let row_groups_metadata = vec![ + build_test_row_group_meta(schema_descr.clone(), columns.clone(), 1000, 0), + build_test_row_group_meta(schema_descr.clone(), columns.clone(), 500, 1), + build_test_row_group_meta(schema_descr.clone(), columns.clone(), 500, 2), + build_test_row_group_meta(schema_descr.clone(), columns.clone(), 1000, 3), + build_test_row_group_meta(schema_descr.clone(), columns.clone(), 500, 4), + ]; + + let selected_row_groups = Some(vec![1, 3]); + + /* cases to cover: + * {skip|select} {first|intermediate|last} {one row|multiple rows} in + {first|intermediate|last} {skipped|selected} row group + * row group selection disabled + */ + + let positional_deletes = RoaringTreemap::from_iter(&[ + 1, // in skipped rg 0, should be ignored + 3, // run of three consecutive items in skipped rg0 + 4, 5, 998, // two consecutive items at end of skipped rg0 + 999, 1000, // solitary row at start of selected rg1 (1, 9) + 1010, // run of 3 rows in selected rg1 + 1011, 1012, // (3, 485) + 1498, // run of two items at end of selected rg1 + 1499, 1500, // run of two items at start of skipped rg2 + 1501, 1600, // should ignore, in skipped rg2 + 1999, // single row at end of skipped rg2 + 2000, // run of two items at start of selected rg3 + 2001, // (4, 98) + 2100, // single row in selected row group 3 (1, 99) + 2200, // run of 3 consecutive rows in selected row group 3 + 2201, 2202, // (3, 796) + 2999, // single item at end of selected rg3 (1) + 3000, // single item at start of skipped rg4 + ]); + + let positional_deletes = DeleteVector::new(positional_deletes); + + // using selected row groups 1 and 3 + let result = ArrowReader::build_deletes_row_selection( + &row_groups_metadata, + &selected_row_groups, + &positional_deletes, + ) + .unwrap(); + + let expected = RowSelection::from(vec![ + RowSelector::skip(1), + RowSelector::select(9), + RowSelector::skip(3), + RowSelector::select(485), + RowSelector::skip(4), + RowSelector::select(98), + RowSelector::skip(1), + RowSelector::select(99), + RowSelector::skip(3), + RowSelector::select(796), + RowSelector::skip(1), + ]); + + assert_eq!(result, expected); + + // selecting all row groups + let result = ArrowReader::build_deletes_row_selection( + &row_groups_metadata, + &None, + &positional_deletes, + ) + .unwrap(); + + let expected = RowSelection::from(vec![ + RowSelector::select(1), + RowSelector::skip(1), + RowSelector::select(1), + RowSelector::skip(3), + RowSelector::select(992), + RowSelector::skip(3), + RowSelector::select(9), + RowSelector::skip(3), + RowSelector::select(485), + RowSelector::skip(4), + RowSelector::select(98), + RowSelector::skip(1), + RowSelector::select(398), + RowSelector::skip(3), + RowSelector::select(98), + RowSelector::skip(1), + RowSelector::select(99), + RowSelector::skip(3), + RowSelector::select(796), + RowSelector::skip(2), + RowSelector::select(499), + ]); + + assert_eq!(result, expected); + } + + /// Test for bug where position deletes in later row groups are not applied correctly. + /// + /// When a file has multiple row groups and a position delete targets a row in a later + /// row group, the `build_deletes_row_selection` function had a bug where it would + /// fail to increment `current_row_group_base_idx` when skipping row groups. 
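+    /// (Without that increment, the file-relative positions of deletes in later row groups
+    /// are compared against a base index that was never advanced: row group 1's range is
+    /// computed as [0, 100) instead of [100, 200), so the delete at 199 never matches.)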
+ /// + /// This test creates: + /// - A data file with 200 rows split into 2 row groups (0-99, 100-199) + /// - A position delete file that deletes row 199 (last row in second row group) + /// + /// Expected behavior: Should return 199 rows (with id=200 deleted) + /// Bug behavior: Returns 200 rows (delete is not applied) + /// + /// This bug was discovered while running Apache Spark + Apache Iceberg integration tests + /// through DataFusion Comet. The following Iceberg Java tests failed due to this bug: + /// - `org.apache.iceberg.spark.extensions.TestMergeOnReadDelete::testDeleteWithMultipleRowGroupsParquet` + /// - `org.apache.iceberg.spark.extensions.TestMergeOnReadUpdate::testUpdateWithMultipleRowGroupsParquet` + #[tokio::test] + async fn test_position_delete_across_multiple_row_groups() { + use arrow_array::{Int32Array, Int64Array}; + use parquet::file::reader::{FileReader, SerializedFileReader}; + + // Field IDs for positional delete schema + const FIELD_ID_POSITIONAL_DELETE_FILE_PATH: u64 = 2147483546; + const FIELD_ID_POSITIONAL_DELETE_POS: u64 = 2147483545; + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + + // Create table schema with a single 'id' column + let table_schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "1".to_string(), + )])), + ])); + + // Step 1: Create data file with 200 rows in 2 row groups + // Row group 0: rows 0-99 (ids 1-100) + // Row group 1: rows 100-199 (ids 101-200) + let data_file_path = format!("{table_location}/data.parquet"); + + let batch1 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new( + Int32Array::from_iter_values(1..=100), + )]) + .unwrap(); + + let batch2 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new( + Int32Array::from_iter_values(101..=200), + )]) + .unwrap(); + + // Force each batch into its own row group + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .set_max_row_group_row_count(Some(100)) + .build(); + + let file = File::create(&data_file_path).unwrap(); + let mut writer = ArrowWriter::try_new(file, arrow_schema.clone(), Some(props)).unwrap(); + writer.write(&batch1).expect("Writing batch 1"); + writer.write(&batch2).expect("Writing batch 2"); + writer.close().unwrap(); + + // Verify we created 2 row groups + let verify_file = File::open(&data_file_path).unwrap(); + let verify_reader = SerializedFileReader::new(verify_file).unwrap(); + assert_eq!( + verify_reader.metadata().num_row_groups(), + 2, + "Should have 2 row groups" + ); + + // Step 2: Create position delete file that deletes row 199 (id=200, last row in row group 1) + let delete_file_path = format!("{table_location}/deletes.parquet"); + + let delete_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("file_path", DataType::Utf8, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + FIELD_ID_POSITIONAL_DELETE_FILE_PATH.to_string(), + )])), + Field::new("pos", DataType::Int64, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + FIELD_ID_POSITIONAL_DELETE_POS.to_string(), + )])), + ])); + + // Delete row at position 199 (0-indexed, so it's the last row: id=200) + let delete_batch = 
RecordBatch::try_new(delete_schema.clone(), vec![
+            Arc::new(StringArray::from_iter_values(vec![data_file_path.clone()])),
+            Arc::new(Int64Array::from_iter_values(vec![199i64])),
+        ])
+        .unwrap();
+
+        let delete_props = WriterProperties::builder()
+            .set_compression(Compression::SNAPPY)
+            .build();
+
+        let delete_file = File::create(&delete_file_path).unwrap();
+        let mut delete_writer =
+            ArrowWriter::try_new(delete_file, delete_schema, Some(delete_props)).unwrap();
+        delete_writer.write(&delete_batch).unwrap();
+        delete_writer.close().unwrap();
+
+        // Step 3: Read the data file with the delete applied
+        let file_io = FileIO::new_with_fs();
+        let reader = ArrowReaderBuilder::new(file_io).build();
+
+        let task = FileScanTask {
+            file_size_in_bytes: std::fs::metadata(&data_file_path).unwrap().len(),
+            start: 0,
+            length: 0,
+            record_count: Some(200),
+            data_file_path: data_file_path.clone(),
+            data_file_format: DataFileFormat::Parquet,
+            schema: table_schema.clone(),
+            project_field_ids: vec![1],
+            predicate: None,
+            deletes: vec![FileScanTaskDeleteFile {
+                file_size_in_bytes: std::fs::metadata(&delete_file_path).unwrap().len(),
+                file_path: delete_file_path,
+                file_type: DataContentType::PositionDeletes,
+                partition_spec_id: 0,
+                equality_ids: None,
+            }],
+            partition: None,
+            partition_spec: None,
+            name_mapping: None,
+            case_sensitive: false,
+        };
+
+        let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream;
+        let result = reader
+            .read(tasks)
+            .unwrap()
+            .try_collect::<Vec<_>>()
+            .await
+            .unwrap();
+
+        // Step 4: Verify we got 199 rows (not 200)
+        let total_rows: usize = result.iter().map(|b| b.num_rows()).sum();
+
+        println!("Total rows read: {total_rows}");
+        println!("Expected: 199 rows (deleted row 199 which had id=200)");
+
+        // This assertion will FAIL before the fix and PASS after the fix
+        assert_eq!(
+            total_rows, 199,
+            "Expected 199 rows after deleting row 199, but got {total_rows} rows. \
+            The bug causes position deletes in later row groups to be ignored."
+        );
+
+        // Verify the deleted row (id=200) is not present
+        let all_ids: Vec<i32> = result
+            .iter()
+            .flat_map(|batch| {
+                batch
+                    .column(0)
+                    .as_primitive::<arrow_array::types::Int32Type>()
+                    .values()
+                    .iter()
+                    .copied()
+            })
+            .collect();
+
+        assert!(
+            !all_ids.contains(&200),
+            "Row with id=200 should be deleted but was found in results"
+        );
+
+        // Verify we have all other ids (1-199)
+        let expected_ids: Vec<i32> = (1..=199).collect();
+        assert_eq!(
+            all_ids, expected_ids,
+            "Should have ids 1-199 but got different values"
+        );
+    }
+
+    /// Test for bug where position deletes are lost when skipping unselected row groups.
+    ///
+    /// This is a variant of `test_position_delete_across_multiple_row_groups` that exercises
+    /// the row group selection code path (`selected_row_groups: Some([...])`).
+    ///
+    /// When a file has multiple row groups and only some are selected for reading,
+    /// the `build_deletes_row_selection` function must correctly skip over deletes in
+    /// unselected row groups WITHOUT consuming deletes that belong to selected row groups.
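+    /// (`advance_to(n)` already leaves the iterator positioned at the first delete >= n;
+    /// an unconditional extra `next()` would consume the first delete of a selected group.)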
+ /// + /// This test creates: + /// - A data file with 200 rows split into 2 row groups (0-99, 100-199) + /// - A position delete file that deletes row 199 (last row in second row group) + /// - Row group selection that reads ONLY row group 1 (rows 100-199) + /// + /// Expected behavior: Should return 99 rows (with row 199 deleted) + /// Bug behavior: Returns 100 rows (delete is lost when skipping row group 0) + /// + /// The bug occurs when processing row group 0 (unselected): + /// ```rust + /// delete_vector_iter.advance_to(next_row_group_base_idx); // Position at first delete >= 100 + /// next_deleted_row_idx_opt = delete_vector_iter.next(); // BUG: Consumes delete at 199! + /// ``` + /// + /// The fix is to NOT call `next()` after `advance_to()` when skipping unselected row groups, + /// because `advance_to()` already positions the iterator correctly without consuming elements. + #[tokio::test] + async fn test_position_delete_with_row_group_selection() { + use arrow_array::{Int32Array, Int64Array}; + use parquet::file::reader::{FileReader, SerializedFileReader}; + + // Field IDs for positional delete schema + const FIELD_ID_POSITIONAL_DELETE_FILE_PATH: u64 = 2147483546; + const FIELD_ID_POSITIONAL_DELETE_POS: u64 = 2147483545; + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + + // Create table schema with a single 'id' column + let table_schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "1".to_string(), + )])), + ])); + + // Step 1: Create data file with 200 rows in 2 row groups + // Row group 0: rows 0-99 (ids 1-100) + // Row group 1: rows 100-199 (ids 101-200) + let data_file_path = format!("{table_location}/data.parquet"); + + let batch1 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new( + Int32Array::from_iter_values(1..=100), + )]) + .unwrap(); + + let batch2 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new( + Int32Array::from_iter_values(101..=200), + )]) + .unwrap(); + + // Force each batch into its own row group + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .set_max_row_group_row_count(Some(100)) + .build(); + + let file = File::create(&data_file_path).unwrap(); + let mut writer = ArrowWriter::try_new(file, arrow_schema.clone(), Some(props)).unwrap(); + writer.write(&batch1).expect("Writing batch 1"); + writer.write(&batch2).expect("Writing batch 2"); + writer.close().unwrap(); + + // Verify we created 2 row groups + let verify_file = File::open(&data_file_path).unwrap(); + let verify_reader = SerializedFileReader::new(verify_file).unwrap(); + assert_eq!( + verify_reader.metadata().num_row_groups(), + 2, + "Should have 2 row groups" + ); + + // Step 2: Create position delete file that deletes row 199 (id=200, last row in row group 1) + let delete_file_path = format!("{table_location}/deletes.parquet"); + + let delete_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("file_path", DataType::Utf8, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + FIELD_ID_POSITIONAL_DELETE_FILE_PATH.to_string(), + )])), + Field::new("pos", DataType::Int64, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), 
+                FIELD_ID_POSITIONAL_DELETE_POS.to_string(),
+            )])),
+        ]));
+
+        // Delete row at position 199 (0-indexed, so it's the last row: id=200)
+        let delete_batch = RecordBatch::try_new(delete_schema.clone(), vec![
+            Arc::new(StringArray::from_iter_values(vec![data_file_path.clone()])),
+            Arc::new(Int64Array::from_iter_values(vec![199i64])),
+        ])
+        .unwrap();
+
+        let delete_props = WriterProperties::builder()
+            .set_compression(Compression::SNAPPY)
+            .build();
+
+        let delete_file = File::create(&delete_file_path).unwrap();
+        let mut delete_writer =
+            ArrowWriter::try_new(delete_file, delete_schema, Some(delete_props)).unwrap();
+        delete_writer.write(&delete_batch).unwrap();
+        delete_writer.close().unwrap();
+
+        // Step 3: Get byte ranges to read ONLY row group 1 (rows 100-199)
+        // This exercises the row group selection code path where row group 0 is skipped
+        let metadata_file = File::open(&data_file_path).unwrap();
+        let metadata_reader = SerializedFileReader::new(metadata_file).unwrap();
+        let metadata = metadata_reader.metadata();
+
+        let row_group_0 = metadata.row_group(0);
+        let row_group_1 = metadata.row_group(1);
+
+        let rg0_start = 4u64; // Parquet files start with 4-byte magic "PAR1"
+        let rg1_start = rg0_start + row_group_0.compressed_size() as u64;
+        let rg1_length = row_group_1.compressed_size() as u64;
+
+        println!(
+            "Row group 0: starts at byte {}, {} bytes compressed",
+            rg0_start,
+            row_group_0.compressed_size()
+        );
+        println!(
+            "Row group 1: starts at byte {}, {} bytes compressed",
+            rg1_start,
+            row_group_1.compressed_size()
+        );
+
+        let file_io = FileIO::new_with_fs();
+        let reader = ArrowReaderBuilder::new(file_io).build();
+
+        // Create FileScanTask that reads ONLY row group 1 via byte range filtering
+        let task = FileScanTask {
+            file_size_in_bytes: std::fs::metadata(&data_file_path).unwrap().len(),
+            start: rg1_start,
+            length: rg1_length,
+            record_count: Some(100), // Row group 1 has 100 rows
+            data_file_path: data_file_path.clone(),
+            data_file_format: DataFileFormat::Parquet,
+            schema: table_schema.clone(),
+            project_field_ids: vec![1],
+            predicate: None,
+            deletes: vec![FileScanTaskDeleteFile {
+                file_size_in_bytes: std::fs::metadata(&delete_file_path).unwrap().len(),
+                file_path: delete_file_path,
+                file_type: DataContentType::PositionDeletes,
+                partition_spec_id: 0,
+                equality_ids: None,
+            }],
+            partition: None,
+            partition_spec: None,
+            name_mapping: None,
+            case_sensitive: false,
+        };
+
+        let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream;
+        let result = reader
+            .read(tasks)
+            .unwrap()
+            .try_collect::<Vec<_>>()
+            .await
+            .unwrap();
+
+        // Step 4: Verify we got 99 rows (not 100)
+        // Row group 1 has 100 rows (ids 101-200), minus 1 delete (id=200) = 99 rows
+        let total_rows: usize = result.iter().map(|b| b.num_rows()).sum();
+
+        println!("Total rows read from row group 1: {total_rows}");
+        println!("Expected: 99 rows (row group 1 has 100 rows, 1 delete at position 199)");
+
+        // This assertion will FAIL before the fix and PASS after the fix
+        assert_eq!(
+            total_rows, 99,
+            "Expected 99 rows from row group 1 after deleting position 199, but got {total_rows} rows. \
+            The bug causes position deletes to be lost when advance_to() is followed by next() \
+            when skipping unselected row groups."
+        );
+
+        // Verify the deleted row (id=200) is not present
+        let all_ids: Vec<i32> = result
+            .iter()
+            .flat_map(|batch| {
+                batch
+                    .column(0)
+                    .as_primitive::<arrow_array::types::Int32Type>()
+                    .values()
+                    .iter()
+                    .copied()
+            })
+            .collect();
+
+        assert!(
+            !all_ids.contains(&200),
+            "Row with id=200 should be deleted but was found in results"
+        );
+
+        // Verify we have ids 101-199 (not 101-200)
+        let expected_ids: Vec<i32> = (101..=199).collect();
+        assert_eq!(
+            all_ids, expected_ids,
+            "Should have ids 101-199 but got different values"
+        );
+    }
+
+    /// Test for bug where a stale cached delete causes an infinite loop when skipping row groups.
+    ///
+    /// This test exposes the inverse scenario of `test_position_delete_with_row_group_selection`:
+    /// - Position delete targets a row in the SKIPPED row group (not the selected one)
+    /// - After calling advance_to(), the cached delete index is stale
+    /// - Without updating the cache, the code enters an infinite loop
+    ///
+    /// This test creates:
+    /// - A data file with 200 rows split into 2 row groups (0-99, 100-199)
+    /// - A position delete file that deletes row 0 (first row in SKIPPED row group 0)
+    /// - Row group selection that reads ONLY row group 1 (rows 100-199)
+    ///
+    /// The bug occurs when skipping row group 0:
+    /// ```rust
+    /// let mut next_deleted_row_idx_opt = delete_vector_iter.next(); // Some(0)
+    /// // ... skip to row group 1 ...
+    /// delete_vector_iter.advance_to(100); // Iterator advances past delete at 0
+    /// // BUG: next_deleted_row_idx_opt is still Some(0) - STALE!
+    /// // When processing row group 1:
+    /// //   current_idx = 100, next_deleted_row_idx = 0, next_row_group_base_idx = 200
+    /// //   Loop condition: 0 < 200 (true)
+    /// //   But: current_idx (100) > next_deleted_row_idx (0)
+    /// //   And: current_idx (100) != next_deleted_row_idx (0)
+    /// //   Neither branch executes -> INFINITE LOOP!
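+    /// // The fix: after advance_to(), refresh the cached next_deleted_row_idx_opt
+    /// // whenever the cached value is stale (i.e. still inside the skipped range).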
+ /// ``` + /// + /// Expected behavior: Should return 100 rows (delete at 0 doesn't affect row group 1) + /// Bug behavior: Infinite loop in build_deletes_row_selection + #[tokio::test] + async fn test_position_delete_in_skipped_row_group() { + use arrow_array::{Int32Array, Int64Array}; + use parquet::file::reader::{FileReader, SerializedFileReader}; + + // Field IDs for positional delete schema + const FIELD_ID_POSITIONAL_DELETE_FILE_PATH: u64 = 2147483546; + const FIELD_ID_POSITIONAL_DELETE_POS: u64 = 2147483545; + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + + // Create table schema with a single 'id' column + let table_schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "1".to_string(), + )])), + ])); + + // Step 1: Create data file with 200 rows in 2 row groups + // Row group 0: rows 0-99 (ids 1-100) + // Row group 1: rows 100-199 (ids 101-200) + let data_file_path = format!("{table_location}/data.parquet"); + + let batch1 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new( + Int32Array::from_iter_values(1..=100), + )]) + .unwrap(); + + let batch2 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new( + Int32Array::from_iter_values(101..=200), + )]) + .unwrap(); + + // Force each batch into its own row group + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .set_max_row_group_row_count(Some(100)) + .build(); + + let file = File::create(&data_file_path).unwrap(); + let mut writer = ArrowWriter::try_new(file, arrow_schema.clone(), Some(props)).unwrap(); + writer.write(&batch1).expect("Writing batch 1"); + writer.write(&batch2).expect("Writing batch 2"); + writer.close().unwrap(); + + // Verify we created 2 row groups + let verify_file = File::open(&data_file_path).unwrap(); + let verify_reader = SerializedFileReader::new(verify_file).unwrap(); + assert_eq!( + verify_reader.metadata().num_row_groups(), + 2, + "Should have 2 row groups" + ); + + // Step 2: Create position delete file that deletes row 0 (id=1, first row in row group 0) + let delete_file_path = format!("{table_location}/deletes.parquet"); + + let delete_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("file_path", DataType::Utf8, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + FIELD_ID_POSITIONAL_DELETE_FILE_PATH.to_string(), + )])), + Field::new("pos", DataType::Int64, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + FIELD_ID_POSITIONAL_DELETE_POS.to_string(), + )])), + ])); + + // Delete row at position 0 (0-indexed, so it's the first row: id=1) + let delete_batch = RecordBatch::try_new(delete_schema.clone(), vec![ + Arc::new(StringArray::from_iter_values(vec![data_file_path.clone()])), + Arc::new(Int64Array::from_iter_values(vec![0i64])), + ]) + .unwrap(); + + let delete_props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + + let delete_file = File::create(&delete_file_path).unwrap(); + let mut delete_writer = + ArrowWriter::try_new(delete_file, delete_schema, Some(delete_props)).unwrap(); + delete_writer.write(&delete_batch).unwrap(); + delete_writer.close().unwrap(); + + // Step 3: Get byte 
ranges to read ONLY row group 1 (rows 100-199)
+        // This exercises the row group selection code path where row group 0 is skipped
+        let metadata_file = File::open(&data_file_path).unwrap();
+        let metadata_reader = SerializedFileReader::new(metadata_file).unwrap();
+        let metadata = metadata_reader.metadata();
+
+        let row_group_0 = metadata.row_group(0);
+        let row_group_1 = metadata.row_group(1);
+
+        let rg0_start = 4u64; // Parquet files start with 4-byte magic "PAR1"
+        let rg1_start = rg0_start + row_group_0.compressed_size() as u64;
+        let rg1_length = row_group_1.compressed_size() as u64;
+
+        let file_io = FileIO::new_with_fs();
+        let reader = ArrowReaderBuilder::new(file_io).build();
+
+        // Create FileScanTask that reads ONLY row group 1 via byte range filtering
+        let task = FileScanTask {
+            file_size_in_bytes: std::fs::metadata(&data_file_path).unwrap().len(),
+            start: rg1_start,
+            length: rg1_length,
+            record_count: Some(100), // Row group 1 has 100 rows
+            data_file_path: data_file_path.clone(),
+            data_file_format: DataFileFormat::Parquet,
+            schema: table_schema.clone(),
+            project_field_ids: vec![1],
+            predicate: None,
+            deletes: vec![FileScanTaskDeleteFile {
+                file_size_in_bytes: std::fs::metadata(&delete_file_path).unwrap().len(),
+                file_path: delete_file_path,
+                file_type: DataContentType::PositionDeletes,
+                partition_spec_id: 0,
+                equality_ids: None,
+            }],
+            partition: None,
+            partition_spec: None,
+            name_mapping: None,
+            case_sensitive: false,
+        };
+
+        let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream;
+        let result = reader
+            .read(tasks)
+            .unwrap()
+            .try_collect::<Vec<_>>()
+            .await
+            .unwrap();
+
+        // Step 4: Verify we got 100 rows (all of row group 1)
+        // The delete at position 0 is in row group 0, which is skipped, so it doesn't affect us
+        let total_rows: usize = result.iter().map(|b| b.num_rows()).sum();
+
+        assert_eq!(
+            total_rows, 100,
+            "Expected 100 rows from row group 1 (delete at position 0 is in skipped row group 0). \
+            If this hangs or fails, it indicates the cached delete index was not updated after advance_to()."
+        );
+
+        // Verify we have all ids from row group 1 (101-200)
+        let all_ids: Vec<i32> = result
+            .iter()
+            .flat_map(|batch| {
+                batch
+                    .column(0)
+                    .as_primitive::<arrow_array::types::Int32Type>()
+                    .values()
+                    .iter()
+                    .copied()
+            })
+            .collect();
+
+        let expected_ids: Vec<i32> = (101..=200).collect();
+        assert_eq!(
+            all_ids, expected_ids,
+            "Should have ids 101-200 (all of row group 1)"
+        );
+    }
+}
diff --git a/crates/iceberg/src/arrow/reader/predicate_visitor.rs b/crates/iceberg/src/arrow/reader/predicate_visitor.rs
new file mode 100644
index 0000000000..272de49390
--- /dev/null
+++ b/crates/iceberg/src/arrow/reader/predicate_visitor.rs
@@ -0,0 +1,820 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Visitors that translate Iceberg bound predicates into the pieces needed for
+//! Arrow-level evaluation: collecting referenced field IDs and producing
+//! per-record-batch predicate closures.
+
+use std::collections::{HashMap, HashSet};
+use std::sync::Arc;
+
+use arrow_arith::boolean::{and, and_kleene, is_not_null, is_null, not, or, or_kleene};
+use arrow_array::cast::AsArray;
+use arrow_array::types::{Float32Type, Float64Type};
+use arrow_array::{Array, ArrayRef, BooleanArray, Datum as ArrowDatum, RecordBatch, Scalar};
+use arrow_buffer::BooleanBuffer;
+use arrow_cast::cast::cast;
+use arrow_ord::cmp::{eq, gt, gt_eq, lt, lt_eq, neq};
+use arrow_schema::{ArrowError, DataType};
+use arrow_string::like::starts_with;
+use fnv::FnvHashSet;
+use parquet::schema::types::SchemaDescriptor;
+
+use crate::arrow::get_arrow_datum;
+use crate::error::Result;
+use crate::expr::visitors::bound_predicate_visitor::BoundPredicateVisitor;
+use crate::expr::{BoundPredicate, BoundReference};
+use crate::spec::Datum;
+use crate::{Error, ErrorKind};
+
+/// A visitor to collect field ids from bound predicates.
+pub(super) struct CollectFieldIdVisitor {
+    pub(super) field_ids: HashSet<i32>,
+}
+
+impl CollectFieldIdVisitor {
+    pub(super) fn field_ids(self) -> HashSet<i32> {
+        self.field_ids
+    }
+}
+
+impl BoundPredicateVisitor for CollectFieldIdVisitor {
+    type T = ();
+
+    fn always_true(&mut self) -> Result<()> {
+        Ok(())
+    }
+
+    fn always_false(&mut self) -> Result<()> {
+        Ok(())
+    }
+
+    fn and(&mut self, _lhs: (), _rhs: ()) -> Result<()> {
+        Ok(())
+    }
+
+    fn or(&mut self, _lhs: (), _rhs: ()) -> Result<()> {
+        Ok(())
+    }
+
+    fn not(&mut self, _inner: ()) -> Result<()> {
+        Ok(())
+    }
+
+    fn is_null(&mut self, reference: &BoundReference, _predicate: &BoundPredicate) -> Result<()> {
+        self.field_ids.insert(reference.field().id);
+        Ok(())
+    }
+
+    fn not_null(&mut self, reference: &BoundReference, _predicate: &BoundPredicate) -> Result<()> {
+        self.field_ids.insert(reference.field().id);
+        Ok(())
+    }
+
+    fn is_nan(&mut self, reference: &BoundReference, _predicate: &BoundPredicate) -> Result<()> {
+        self.field_ids.insert(reference.field().id);
+        Ok(())
+    }
+
+    fn not_nan(&mut self, reference: &BoundReference, _predicate: &BoundPredicate) -> Result<()> {
+        self.field_ids.insert(reference.field().id);
+        Ok(())
+    }
+
+    fn less_than(
+        &mut self,
+        reference: &BoundReference,
+        _literal: &Datum,
+        _predicate: &BoundPredicate,
+    ) -> Result<()> {
+        self.field_ids.insert(reference.field().id);
+        Ok(())
+    }
+
+    fn less_than_or_eq(
+        &mut self,
+        reference: &BoundReference,
+        _literal: &Datum,
+        _predicate: &BoundPredicate,
+    ) -> Result<()> {
+        self.field_ids.insert(reference.field().id);
+        Ok(())
+    }
+
+    fn greater_than(
+        &mut self,
+        reference: &BoundReference,
+        _literal: &Datum,
+        _predicate: &BoundPredicate,
+    ) -> Result<()> {
+        self.field_ids.insert(reference.field().id);
+        Ok(())
+    }
+
+    fn greater_than_or_eq(
+        &mut self,
+        reference: &BoundReference,
+        _literal: &Datum,
+        _predicate: &BoundPredicate,
+    ) -> Result<()> {
+        self.field_ids.insert(reference.field().id);
+        Ok(())
+    }
+
+    fn eq(
+        &mut self,
+        reference: &BoundReference,
+        _literal: &Datum,
+        _predicate: &BoundPredicate,
+    ) -> Result<()> {
+        self.field_ids.insert(reference.field().id);
+        Ok(())
+    }
+
+    fn not_eq(
+        &mut self,
+        reference: &BoundReference,
+        _literal: &Datum,
+        _predicate: &BoundPredicate,
+    ) -> Result<()> {
+        self.field_ids.insert(reference.field().id);
+        Ok(())
+    }
+
+    fn starts_with(
+        &mut self,
+        reference: &BoundReference,
+        _literal: &Datum,
+        _predicate: &BoundPredicate,
+    ) -> Result<()> {
+        self.field_ids.insert(reference.field().id);
+        Ok(())
+    }
+
+    fn not_starts_with(
+        &mut self,
+        reference: &BoundReference,
+        _literal: &Datum,
+        _predicate: &BoundPredicate,
+    ) -> Result<()> {
+        self.field_ids.insert(reference.field().id);
+        Ok(())
+    }
+
+    fn r#in(
+        &mut self,
+        reference: &BoundReference,
+        _literals: &FnvHashSet<Datum>,
+        _predicate: &BoundPredicate,
+    ) -> Result<()> {
+        self.field_ids.insert(reference.field().id);
+        Ok(())
+    }
+
+    fn not_in(
+        &mut self,
+        reference: &BoundReference,
+        _literals: &FnvHashSet<Datum>,
+        _predicate: &BoundPredicate,
+    ) -> Result<()> {
+        self.field_ids.insert(reference.field().id);
+        Ok(())
+    }
+}
+
+/// A visitor to convert Iceberg bound predicates to Arrow predicates.
+pub(super) struct PredicateConverter<'a> {
+    /// The Parquet schema descriptor.
+    pub(super) parquet_schema: &'a SchemaDescriptor,
+    /// The map between field id and leaf column index in the Parquet schema.
+    pub(super) column_map: &'a HashMap<i32, usize>,
+    /// The required column indices in the Parquet schema for the predicates.
+    pub(super) column_indices: &'a Vec<usize>,
+}
+
+impl PredicateConverter<'_> {
+    /// When visiting a bound reference, we return the index of the leaf column in the
+    /// required column indices, which is used to project the column in the record batch.
+    /// Returns None if the field id is not found in the column map, which is possible
+    /// due to schema evolution.
+    fn bound_reference(&mut self, reference: &BoundReference) -> Result<Option<usize>> {
+        // The leaf column's index in the Parquet schema.
+        if let Some(column_idx) = self.column_map.get(&reference.field().id) {
+            if self.parquet_schema.get_column_root(*column_idx).is_group() {
+                return Err(Error::new(
+                    ErrorKind::DataInvalid,
+                    format!(
+                        "Leaf column `{}` in predicates isn't a root column in the Parquet schema.",
+                        reference.field().name
+                    ),
+                ));
+            }
+
+            // The leaf column's index in the required column indices.
+            let index = self
+                .column_indices
+                .iter()
+                .position(|&idx| idx == *column_idx)
+                .ok_or(Error::new(
+                    ErrorKind::DataInvalid,
+                    format!(
+                        "Leaf column `{}` in predicates cannot be found in the required column indices.",
+                        reference.field().name
+                    ),
+                ))?;
+
+            Ok(Some(index))
+        } else {
+            Ok(None)
+        }
+    }
+
+    /// Build an Arrow predicate that always returns true.
+    fn build_always_true(&self) -> Result<Box<PredicateResult>> {
+        Ok(Box::new(|batch| {
+            Ok(BooleanArray::from(vec![true; batch.num_rows()]))
+        }))
+    }
+
+    /// Build an Arrow predicate that always returns false.
+    fn build_always_false(&self) -> Result<Box<PredicateResult>> {
+        Ok(Box::new(|batch| {
+            Ok(BooleanArray::from(vec![false; batch.num_rows()]))
+        }))
+    }
+}
+
+/// Gets the leaf column from the record batch for the required column index. Only
+/// supports top-level columns for now.
+fn project_column(
+    batch: &RecordBatch,
+    column_idx: usize,
+) -> std::result::Result<ArrayRef, ArrowError> {
+    let column = batch.column(column_idx);
+
+    match column.data_type() {
+        DataType::Struct(_) => Err(ArrowError::SchemaError(
+            "Does not support struct column yet.".to_string(),
+        )),
+        _ => Ok(column.clone()),
+    }
+}
+
+fn compute_is_nan(array: &ArrayRef) -> std::result::Result<BooleanArray, ArrowError> {
+    // Compute NaN over the contiguous values slice, then fold the null bitmap
+    // in with a single bitwise AND so that null slots become false.
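+    // Note: values() exposes the raw buffer, so slots masked out by the null bitmap
+    // still hold arbitrary bytes; ANDing with the bitmap forces those slots to false.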
+    let (is_nan, nulls) = match array.data_type() {
+        DataType::Float32 => {
+            let arr = array.as_primitive::<Float32Type>();
+            (
+                BooleanBuffer::from_iter(arr.values().iter().map(|v| v.is_nan())),
+                arr.nulls(),
+            )
+        }
+        DataType::Float64 => {
+            let arr = array.as_primitive::<Float64Type>();
+            (
+                BooleanBuffer::from_iter(arr.values().iter().map(|v| v.is_nan())),
+                arr.nulls(),
+            )
+        }
+        _ => unreachable!("is_nan is only valid for float types"),
+    };
+
+    let values = match nulls {
+        Some(nulls) => &is_nan & nulls.inner(),
+        None => is_nan,
+    };
+
+    Ok(BooleanArray::new(values, None))
+}
+
+pub(super) type PredicateResult =
+    dyn FnMut(RecordBatch) -> std::result::Result<BooleanArray, ArrowError> + Send + 'static;
+
+impl BoundPredicateVisitor for PredicateConverter<'_> {
+    type T = Box<PredicateResult>;
+
+    fn always_true(&mut self) -> Result<Box<PredicateResult>> {
+        self.build_always_true()
+    }
+
+    fn always_false(&mut self) -> Result<Box<PredicateResult>> {
+        self.build_always_false()
+    }
+
+    fn and(
+        &mut self,
+        mut lhs: Box<PredicateResult>,
+        mut rhs: Box<PredicateResult>,
+    ) -> Result<Box<PredicateResult>> {
+        Ok(Box::new(move |batch| {
+            let left = lhs(batch.clone())?;
+            let right = rhs(batch)?;
+            and_kleene(&left, &right)
+        }))
+    }
+
+    fn or(
+        &mut self,
+        mut lhs: Box<PredicateResult>,
+        mut rhs: Box<PredicateResult>,
+    ) -> Result<Box<PredicateResult>> {
+        Ok(Box::new(move |batch| {
+            let left = lhs(batch.clone())?;
+            let right = rhs(batch)?;
+            or_kleene(&left, &right)
+        }))
+    }
+
+    fn not(&mut self, mut inner: Box<PredicateResult>) -> Result<Box<PredicateResult>> {
+        Ok(Box::new(move |batch| {
+            let pred_ret = inner(batch)?;
+            not(&pred_ret)
+        }))
+    }
+
+    fn is_null(
+        &mut self,
+        reference: &BoundReference,
+        _predicate: &BoundPredicate,
+    ) -> Result<Box<PredicateResult>> {
+        if let Some(idx) = self.bound_reference(reference)? {
+            Ok(Box::new(move |batch| {
+                let column = project_column(&batch, idx)?;
+                is_null(&column)
+            }))
+        } else {
+            // A missing column, treating it as null.
+            self.build_always_true()
+        }
+    }
+
+    fn not_null(
+        &mut self,
+        reference: &BoundReference,
+        _predicate: &BoundPredicate,
+    ) -> Result<Box<PredicateResult>> {
+        if let Some(idx) = self.bound_reference(reference)? {
+            Ok(Box::new(move |batch| {
+                let column = project_column(&batch, idx)?;
+                is_not_null(&column)
+            }))
+        } else {
+            // A missing column, treating it as null.
+            self.build_always_false()
+        }
+    }
+
+    fn is_nan(
+        &mut self,
+        reference: &BoundReference,
+        _predicate: &BoundPredicate,
+    ) -> Result<Box<PredicateResult>> {
+        if let Some(idx) = self.bound_reference(reference)? {
+            Ok(Box::new(move |batch| {
+                let column = project_column(&batch, idx)?;
+                compute_is_nan(&column)
+            }))
+        } else {
+            // A missing column, treating it as null.
+            self.build_always_false()
+        }
+    }
+
+    fn not_nan(
+        &mut self,
+        reference: &BoundReference,
+        _predicate: &BoundPredicate,
+    ) -> Result<Box<PredicateResult>> {
+        if let Some(idx) = self.bound_reference(reference)? {
+            Ok(Box::new(move |batch| {
+                let column = project_column(&batch, idx)?;
+                let is_nan = compute_is_nan(&column)?;
+                not(&is_nan)
+            }))
+        } else {
+            // A missing column, treating it as null.
+            self.build_always_true()
+        }
+    }
+
+    fn less_than(
+        &mut self,
+        reference: &BoundReference,
+        literal: &Datum,
+        _predicate: &BoundPredicate,
+    ) -> Result<Box<PredicateResult>> {
+        if let Some(idx) = self.bound_reference(reference)? {
+            let literal = get_arrow_datum(literal)?;
+
+            Ok(Box::new(move |batch| {
+                let left = project_column(&batch, idx)?;
+                let literal = try_cast_literal(&literal, left.data_type())?;
+                lt(&left, literal.as_ref())
+            }))
+        } else {
+            // A missing column, treating it as null.
+            self.build_always_true()
+        }
+    }
+
+    fn less_than_or_eq(
+        &mut self,
+        reference: &BoundReference,
+        literal: &Datum,
+        _predicate: &BoundPredicate,
+    ) -> Result<Box<PredicateResult>> {
+        if let Some(idx) = self.bound_reference(reference)? {
+            let literal = get_arrow_datum(literal)?;
+
+            Ok(Box::new(move |batch| {
+                let left = project_column(&batch, idx)?;
+                let literal = try_cast_literal(&literal, left.data_type())?;
+                lt_eq(&left, literal.as_ref())
+            }))
+        } else {
+            // A missing column, treating it as null.
+            self.build_always_true()
+        }
+    }
+
+    fn greater_than(
+        &mut self,
+        reference: &BoundReference,
+        literal: &Datum,
+        _predicate: &BoundPredicate,
+    ) -> Result<Box<PredicateResult>> {
+        if let Some(idx) = self.bound_reference(reference)? {
+            let literal = get_arrow_datum(literal)?;
+
+            Ok(Box::new(move |batch| {
+                let left = project_column(&batch, idx)?;
+                let literal = try_cast_literal(&literal, left.data_type())?;
+                gt(&left, literal.as_ref())
+            }))
+        } else {
+            // A missing column, treating it as null.
+            self.build_always_false()
+        }
+    }
+
+    fn greater_than_or_eq(
+        &mut self,
+        reference: &BoundReference,
+        literal: &Datum,
+        _predicate: &BoundPredicate,
+    ) -> Result<Box<PredicateResult>> {
+        if let Some(idx) = self.bound_reference(reference)? {
+            let literal = get_arrow_datum(literal)?;
+
+            Ok(Box::new(move |batch| {
+                let left = project_column(&batch, idx)?;
+                let literal = try_cast_literal(&literal, left.data_type())?;
+                gt_eq(&left, literal.as_ref())
+            }))
+        } else {
+            // A missing column, treating it as null.
+            self.build_always_false()
+        }
+    }
+
+    fn eq(
+        &mut self,
+        reference: &BoundReference,
+        literal: &Datum,
+        _predicate: &BoundPredicate,
+    ) -> Result<Box<PredicateResult>> {
+        if let Some(idx) = self.bound_reference(reference)? {
+            let literal = get_arrow_datum(literal)?;
+
+            Ok(Box::new(move |batch| {
+                let left = project_column(&batch, idx)?;
+                let literal = try_cast_literal(&literal, left.data_type())?;
+                eq(&left, literal.as_ref())
+            }))
+        } else {
+            // A missing column, treating it as null.
+            self.build_always_false()
+        }
+    }
+
+    fn not_eq(
+        &mut self,
+        reference: &BoundReference,
+        literal: &Datum,
+        _predicate: &BoundPredicate,
+    ) -> Result<Box<PredicateResult>> {
+        if let Some(idx) = self.bound_reference(reference)? {
+            let literal = get_arrow_datum(literal)?;
+
+            Ok(Box::new(move |batch| {
+                let left = project_column(&batch, idx)?;
+                let literal = try_cast_literal(&literal, left.data_type())?;
+                neq(&left, literal.as_ref())
+            }))
+        } else {
+            // A missing column, treating it as null.
+            self.build_always_false()
+        }
+    }
+
+    fn starts_with(
+        &mut self,
+        reference: &BoundReference,
+        literal: &Datum,
+        _predicate: &BoundPredicate,
+    ) -> Result<Box<PredicateResult>> {
+        if let Some(idx) = self.bound_reference(reference)? {
+            let literal = get_arrow_datum(literal)?;
+
+            Ok(Box::new(move |batch| {
+                let left = project_column(&batch, idx)?;
+                let literal = try_cast_literal(&literal, left.data_type())?;
+                starts_with(&left, literal.as_ref())
+            }))
+        } else {
+            // A missing column, treating it as null.
+            self.build_always_false()
+        }
+    }
+
+    fn not_starts_with(
+        &mut self,
+        reference: &BoundReference,
+        literal: &Datum,
+        _predicate: &BoundPredicate,
+    ) -> Result<Box<PredicateResult>> {
+        if let Some(idx) = self.bound_reference(reference)? {
+            let literal = get_arrow_datum(literal)?;
+
+            Ok(Box::new(move |batch| {
+                let left = project_column(&batch, idx)?;
+                let literal = try_cast_literal(&literal, left.data_type())?;
+                // update here if arrow ever adds a native not_starts_with
+                not(&starts_with(&left, literal.as_ref())?)
+            }))
+        } else {
+            // A missing column, treating it as null.
+            self.build_always_true()
+        }
+    }
+
+    fn r#in(
+        &mut self,
+        reference: &BoundReference,
+        literals: &FnvHashSet<Datum>,
+        _predicate: &BoundPredicate,
+    ) -> Result<Box<PredicateResult>> {
+        if let Some(idx) = self.bound_reference(reference)? {
+            let literals: Vec<_> = literals
+                .iter()
+                .map(|lit| get_arrow_datum(lit).unwrap())
+                .collect();
+
+            Ok(Box::new(move |batch| {
+                // update this if arrow ever adds a native is_in kernel
+                let left = project_column(&batch, idx)?;
+
+                let mut acc = BooleanArray::from(vec![false; batch.num_rows()]);
+                for literal in &literals {
+                    let literal = try_cast_literal(literal, left.data_type())?;
+                    acc = or(&acc, &eq(&left, literal.as_ref())?)?;
+                }
+
+                Ok(acc)
+            }))
+        } else {
+            // A missing column, treating it as null.
+            self.build_always_false()
+        }
+    }
+
+    fn not_in(
+        &mut self,
+        reference: &BoundReference,
+        literals: &FnvHashSet<Datum>,
+        _predicate: &BoundPredicate,
+    ) -> Result<Box<PredicateResult>> {
+        if let Some(idx) = self.bound_reference(reference)? {
+            let literals: Vec<_> = literals
+                .iter()
+                .map(|lit| get_arrow_datum(lit).unwrap())
+                .collect();
+
+            Ok(Box::new(move |batch| {
+                // update this if arrow ever adds a native not_in kernel
+                let left = project_column(&batch, idx)?;
+                let mut acc = BooleanArray::from(vec![true; batch.num_rows()]);
+                for literal in &literals {
+                    let literal = try_cast_literal(literal, left.data_type())?;
+                    acc = and(&acc, &neq(&left, literal.as_ref())?)?;
+                }
+
+                Ok(acc)
+            }))
+        } else {
+            // A missing column, treating it as null.
+            self.build_always_true()
+        }
+    }
+}
+
+/// The Arrow type of an array that the Parquet reader reads may not match the exact Arrow type
+/// that Iceberg uses for literals - but they are effectively the same logical type,
+/// i.e. LargeUtf8 and Utf8, or Utf8View and Utf8, or Utf8View and LargeUtf8.
+///
+/// The Arrow compute kernels that we use must match the type exactly, so first cast the literal
+/// into the type of the batch we read from Parquet before sending it to the compute kernel.
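+///
+/// For example (illustrative): a literal materialized as `Utf8` is cast to `LargeUtf8`
+/// before being compared against a `LargeUtf8` column read from the file.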
+fn try_cast_literal(
+    literal: &Arc<dyn ArrowDatum + Send + Sync>,
+    column_type: &DataType,
+) -> std::result::Result<Arc<dyn ArrowDatum + Send + Sync>, ArrowError> {
+    let literal_array = literal.get().0;
+
+    // No cast required
+    if literal_array.data_type() == column_type {
+        return Ok(Arc::clone(literal));
+    }
+
+    let literal_array = cast(literal_array, column_type)?;
+    Ok(Arc::new(Scalar::new(literal_array)))
+}
+
+#[cfg(test)]
+mod tests {
+    use std::collections::{HashMap, HashSet};
+    use std::sync::Arc;
+
+    use arrow_array::{Array, BooleanArray, RecordBatch};
+    use arrow_schema::{DataType, Field, Schema as ArrowSchema};
+    use parquet::schema::parser::parse_message_type;
+    use parquet::schema::types::SchemaDescriptor;
+
+    use super::{CollectFieldIdVisitor, PredicateConverter};
+    use crate::expr::visitors::bound_predicate_visitor::visit;
+    use crate::expr::{Bind, Predicate, Reference};
+    use crate::spec::{NestedField, PrimitiveType, Schema, SchemaRef, Type};
+
+    fn table_schema_simple() -> SchemaRef {
+        Arc::new(
+            Schema::builder()
+                .with_schema_id(1)
+                .with_identifier_field_ids(vec![2])
+                .with_fields(vec![
+                    NestedField::optional(1, "foo", Type::Primitive(PrimitiveType::String)).into(),
+                    NestedField::required(2, "bar", Type::Primitive(PrimitiveType::Int)).into(),
+                    NestedField::optional(3, "baz", Type::Primitive(PrimitiveType::Boolean)).into(),
+                    NestedField::optional(4, "qux", Type::Primitive(PrimitiveType::Float)).into(),
+                ])
+                .build()
+                .unwrap(),
+        )
+    }
+
+    #[test]
+    fn test_collect_field_id() {
+        let schema = table_schema_simple();
+        let expr = Reference::new("qux").is_null();
+        let bound_expr = expr.bind(schema, true).unwrap();
+
+        let mut visitor = CollectFieldIdVisitor {
+            field_ids: HashSet::default(),
+        };
+        visit(&mut visitor, &bound_expr).unwrap();
+
+        let mut expected = HashSet::default();
+        expected.insert(4_i32);
+
+        assert_eq!(visitor.field_ids, expected);
+    }
+
+    #[test]
+    fn test_collect_field_id_with_and() {
+        let schema = table_schema_simple();
+        let expr = Reference::new("qux")
+            .is_null()
+            .and(Reference::new("baz").is_null());
+        let bound_expr = expr.bind(schema, true).unwrap();
+
+        let mut visitor = CollectFieldIdVisitor {
+            field_ids: HashSet::default(),
+        };
+        visit(&mut visitor, &bound_expr).unwrap();
+
+        let mut expected = HashSet::default();
+        expected.insert(4_i32);
+        expected.insert(3);
+
+        assert_eq!(visitor.field_ids, expected);
+    }
+
+    #[test]
+    fn test_collect_field_id_with_or() {
+        let schema = table_schema_simple();
+        let expr = Reference::new("qux")
+            .is_null()
+            .or(Reference::new("baz").is_null());
+        let bound_expr = expr.bind(schema, true).unwrap();
+
+        let mut visitor = CollectFieldIdVisitor {
+            field_ids: HashSet::default(),
+        };
+        visit(&mut visitor, &bound_expr).unwrap();
+
+        let mut expected = HashSet::default();
+        expected.insert(4_i32);
+        expected.insert(3);
+
+        assert_eq!(visitor.field_ids, expected);
+    }
+
+    fn apply_predicate_to_batch(
+        predicate: Predicate,
+        schema: SchemaRef,
+        batch: RecordBatch,
+    ) -> BooleanArray {
+        let bound = predicate.bind(schema, true).unwrap();
+
+        // Build a trivial Parquet schema with one float column at field id 4
+        let message_type = "
+        message schema {
+            optional float qux = 4;
+        }
+        ";
+        let parquet_type = parse_message_type(message_type).expect("parse schema");
+        let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_type));
+
+        let column_map = HashMap::from([(4i32, 0usize)]);
+        let column_indices = vec![0usize];
+
+        let mut converter = PredicateConverter {
+            parquet_schema: &parquet_schema,
+            column_map: &column_map,
+            column_indices:
&column_indices, + }; + + let mut predicate_fn = visit(&mut converter, &bound).unwrap(); + predicate_fn(batch).unwrap() + } + + #[test] + fn test_predicate_converter_nan() { + use arrow_array::Float32Array; + + let schema = table_schema_simple(); + let arrow_schema = Arc::new(ArrowSchema::new(vec![Field::new( + "qux", + DataType::Float32, + true, + )])); + let values = vec![Some(1.0f32), Some(f32::NAN), None, Some(0.0f32)]; + + // is_nan: non-null-propagating per Java's implementation - NULL → false + let batch = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new(Float32Array::from( + values.clone(), + ))]) + .unwrap(); + let result = + apply_predicate_to_batch(Reference::new("qux").is_nan(), schema.clone(), batch); + assert_eq!( + [ + result.value(0), + result.value(1), + result.value(2), + result.value(3) + ], + [false, true, false, false] + ); + assert!(!result.is_null(2)); + + // not_nan: non-null-propagating per Java's implementation - NULL → true + let batch = + RecordBatch::try_new(arrow_schema, vec![Arc::new(Float32Array::from(values))]).unwrap(); + let result = apply_predicate_to_batch(Reference::new("qux").is_not_nan(), schema, batch); + assert_eq!( + [ + result.value(0), + result.value(1), + result.value(2), + result.value(3) + ], + [true, false, true, true] + ); + assert!(!result.is_null(2)); + } +} diff --git a/crates/iceberg/src/arrow/reader/projection.rs b/crates/iceberg/src/arrow/reader/projection.rs new file mode 100644 index 0000000000..d3fa00b84b --- /dev/null +++ b/crates/iceberg/src/arrow/reader/projection.rs @@ -0,0 +1,1718 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Column projection for `ArrowReader`: building the Parquet projection mask +//! from Iceberg field IDs, and mapping field IDs between Iceberg and Parquet +//! (including fallback handling for files without embedded IDs). 
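+//!
+//! For example (illustrative): a file whose top-level columns are `(name, age)` and which
+//! carries no embedded field IDs receives position-based fallback IDs 1 and 2, unless a
+//! name mapping supplies explicit IDs first (see `apply_name_mapping_to_arrow_schema`).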
+
+use std::collections::{HashMap, HashSet};
+use std::str::FromStr;
+use std::sync::Arc;
+
+use arrow_schema::{Field, FieldRef, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef};
+use parquet::arrow::{PARQUET_FIELD_ID_META_KEY, ProjectionMask};
+use parquet::schema::types::{SchemaDescriptor, Type as ParquetType};
+
+use super::{ArrowReader, CollectFieldIdVisitor};
+use crate::arrow::arrow_schema_to_schema;
+use crate::error::Result;
+use crate::expr::BoundPredicate;
+use crate::expr::visitors::bound_predicate_visitor::visit;
+use crate::spec::{NameMapping, NestedField, PrimitiveType, Schema, Type};
+use crate::{Error, ErrorKind};
+
+impl ArrowReader {
+    pub(super) fn build_field_id_set_and_map(
+        parquet_schema: &SchemaDescriptor,
+        predicate: &BoundPredicate,
+    ) -> Result<(HashSet<i32>, HashMap<i32, usize>)> {
+        // Collect all Iceberg field IDs referenced in the filter predicate
+        let mut collector = CollectFieldIdVisitor {
+            field_ids: HashSet::default(),
+        };
+        visit(&mut collector, predicate)?;
+
+        let iceberg_field_ids = collector.field_ids();
+
+        // Without embedded field IDs, we fall back to position-based mapping for compatibility
+        let field_id_map = match build_field_id_map(parquet_schema)? {
+            Some(map) => map,
+            None => build_fallback_field_id_map(parquet_schema),
+        };
+
+        Ok((iceberg_field_ids, field_id_map))
+    }
+
+    /// Recursively extract leaf field IDs, because Parquet projection works at the leaf column
+    /// level: nested types (struct/list/map) are flattened in Parquet's columnar format.
+    fn include_leaf_field_id(field: &NestedField, field_ids: &mut Vec<i32>) {
+        match field.field_type.as_ref() {
+            Type::Primitive(_) => {
+                field_ids.push(field.id);
+            }
+            Type::Struct(struct_type) => {
+                for nested_field in struct_type.fields() {
+                    Self::include_leaf_field_id(nested_field, field_ids);
+                }
+            }
+            Type::List(list_type) => {
+                Self::include_leaf_field_id(&list_type.element_field, field_ids);
+            }
+            Type::Map(map_type) => {
+                Self::include_leaf_field_id(&map_type.key_field, field_ids);
+                Self::include_leaf_field_id(&map_type.value_field, field_ids);
+            }
+        }
+    }
+
+    pub(super) fn get_arrow_projection_mask(
+        field_ids: &[i32],
+        iceberg_schema_of_task: &Schema,
+        parquet_schema: &SchemaDescriptor,
+        arrow_schema: &ArrowSchemaRef,
+        use_fallback: bool, // Whether the file lacks embedded field IDs (e.g., migrated from Hive/Spark)
+    ) -> Result<ProjectionMask> {
+        fn type_promotion_is_valid(
+            file_type: Option<&PrimitiveType>,
+            projected_type: Option<&PrimitiveType>,
+        ) -> bool {
+            match (file_type, projected_type) {
+                (Some(lhs), Some(rhs)) if lhs == rhs => true,
+                (Some(PrimitiveType::Int), Some(PrimitiveType::Long)) => true,
+                (Some(PrimitiveType::Float), Some(PrimitiveType::Double)) => true,
+                (
+                    Some(PrimitiveType::Decimal {
+                        precision: file_precision,
+                        scale: file_scale,
+                    }),
+                    Some(PrimitiveType::Decimal {
+                        precision: requested_precision,
+                        scale: requested_scale,
+                    }),
+                ) if requested_precision >= file_precision && file_scale == requested_scale => true,
+                // Uuid is stored as Fixed(16) in the Parquet file, so the read-back type will be Fixed(16).
+                (Some(PrimitiveType::Fixed(16)), Some(PrimitiveType::Uuid)) => true,
+                _ => false,
+            }
+        }
+
+        if field_ids.is_empty() {
+            return Ok(ProjectionMask::all());
+        }
+
+        if use_fallback {
+            // Position-based projection is necessary because the file lacks embedded field IDs
+            Self::get_arrow_projection_mask_fallback(field_ids, parquet_schema)
+        } else {
+            // Field-ID-based projection using embedded field IDs from Parquet metadata
+
+            // Parquet's columnar format requires leaf-level (not top-level struct/list/map) projection
+            let mut leaf_field_ids = vec![];
+            for field_id in field_ids {
+                let field = iceberg_schema_of_task.field_by_id(*field_id);
+                if let Some(field) = field {
+                    Self::include_leaf_field_id(field, &mut leaf_field_ids);
+                }
+            }
+
+            Self::get_arrow_projection_mask_with_field_ids(
+                &leaf_field_ids,
+                iceberg_schema_of_task,
+                parquet_schema,
+                arrow_schema,
+                type_promotion_is_valid,
+            )
+        }
+    }
+
+    /// Standard projection using embedded field IDs from Parquet metadata.
+    /// For iceberg-java compatibility with ParquetSchemaUtil.pruneColumns().
+    fn get_arrow_projection_mask_with_field_ids(
+        leaf_field_ids: &[i32],
+        iceberg_schema_of_task: &Schema,
+        parquet_schema: &SchemaDescriptor,
+        arrow_schema: &ArrowSchemaRef,
+        type_promotion_is_valid: fn(Option<&PrimitiveType>, Option<&PrimitiveType>) -> bool,
+    ) -> Result<ProjectionMask> {
+        let mut column_map = HashMap::new();
+        let fields = arrow_schema.fields();
+
+        // Pre-project only the fields that have been selected, possibly avoiding converting
+        // some Arrow types that are not yet supported.
+        let mut projected_fields: HashMap<FieldRef, i32> = HashMap::new();
+        let projected_arrow_schema = ArrowSchema::new_with_metadata(
+            fields.filter_leaves(|_, f| {
+                f.metadata()
+                    .get(PARQUET_FIELD_ID_META_KEY)
+                    .and_then(|field_id| i32::from_str(field_id).ok())
+                    .is_some_and(|field_id| {
+                        projected_fields.insert((*f).clone(), field_id);
+                        leaf_field_ids.contains(&field_id)
+                    })
+            }),
+            arrow_schema.metadata().clone(),
+        );
+        let iceberg_schema = arrow_schema_to_schema(&projected_arrow_schema)?;
+
+        fields.filter_leaves(|idx, field| {
+            let Some(field_id) = projected_fields.get(field).cloned() else {
+                return false;
+            };
+
+            let iceberg_field = iceberg_schema_of_task.field_by_id(field_id);
+            let parquet_iceberg_field = iceberg_schema.field_by_id(field_id);
+
+            if iceberg_field.is_none() || parquet_iceberg_field.is_none() {
+                return false;
+            }
+
+            if !type_promotion_is_valid(
+                parquet_iceberg_field
+                    .unwrap()
+                    .field_type
+                    .as_primitive_type(),
+                iceberg_field.unwrap().field_type.as_primitive_type(),
+            ) {
+                return false;
+            }
+
+            column_map.insert(field_id, idx);
+            true
+        });
+
+        // Schema evolution: new columns may not exist in old Parquet files.
+        // We only project existing columns; RecordBatchTransformer adds default/NULL values.
+        let mut indices = vec![];
+        for field_id in leaf_field_ids {
+            if let Some(col_idx) = column_map.get(field_id) {
+                indices.push(*col_idx);
+            }
+        }
+
+        if indices.is_empty() {
+            // Edge case: all requested columns are new (don't exist in the file).
+            // Project all columns so RecordBatchTransformer has a batch to transform.
+            Ok(ProjectionMask::all())
+        } else {
+            Ok(ProjectionMask::leaves(parquet_schema, indices))
+        }
+    }
+
+    /// Fallback projection for Parquet files without field IDs.
+    /// Uses position-based matching: field ID N → column position N-1.
+    /// Projects entire top-level columns (including nested content) for iceberg-java compatibility.
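+    ///
+    /// For example (illustrative): requesting field IDs `[2, 3]` against a file whose
+    /// top-level columns are `[a, b, c]` projects positions 1 and 2, i.e. columns `b` and `c`.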
+    fn get_arrow_projection_mask_fallback(
+        field_ids: &[i32],
+        parquet_schema: &SchemaDescriptor,
+    ) -> Result<ProjectionMask> {
+        // Position-based: field_id N → column N-1 (field IDs are 1-indexed)
+        let parquet_root_fields = parquet_schema.root_schema().get_fields();
+        let mut root_indices = vec![];
+
+        for field_id in field_ids.iter() {
+            let parquet_pos = (*field_id - 1) as usize;
+
+            if parquet_pos < parquet_root_fields.len() {
+                root_indices.push(parquet_pos);
+            }
+            // RecordBatchTransformer adds missing columns with NULL values
+        }
+
+        if root_indices.is_empty() {
+            Ok(ProjectionMask::all())
+        } else {
+            Ok(ProjectionMask::roots(parquet_schema, root_indices))
+        }
+    }
+}
+
+/// Build the map of Parquet field ID to Parquet column index in the schema.
+/// Returns None if the Parquet file doesn't have field IDs embedded (e.g., migrated tables).
+pub(super) fn build_field_id_map(
+    parquet_schema: &SchemaDescriptor,
+) -> Result<Option<HashMap<i32, usize>>> {
+    let mut column_map = HashMap::new();
+
+    for (idx, field) in parquet_schema.columns().iter().enumerate() {
+        let field_type = field.self_type();
+        match field_type {
+            ParquetType::PrimitiveType { basic_info, .. } => {
+                if !basic_info.has_id() {
+                    return Ok(None);
+                }
+                column_map.insert(basic_info.id(), idx);
+            }
+            ParquetType::GroupType { .. } => {
+                return Err(Error::new(
+                    ErrorKind::DataInvalid,
+                    format!(
+                        "Leaf column in schema should be primitive type but got {field_type:?}"
+                    ),
+                ));
+            }
+        };
+    }
+
+    Ok(Some(column_map))
+}
+
+/// Returns the number of primitive (leaf) columns in a Parquet type, recursing into groups.
+fn leaf_count(ty: &parquet::schema::types::Type) -> usize {
+    if ty.is_primitive() {
+        1
+    } else {
+        ty.get_fields().iter().map(|f| leaf_count(f)).sum()
+    }
+}
+
+/// Builds a mapping from fallback field IDs to leaf column indices for Parquet files
+/// without embedded field IDs. Returns entries only for primitive top-level fields.
+///
+/// Must use top-level field positions (not leaf column positions) to stay consistent
+/// with `add_fallback_field_ids_to_arrow_schema`, which assigns ordinal IDs to
+/// top-level Arrow fields. Using leaf positions instead would produce wrong indices
+/// when nested types (struct/list/map) expand into multiple leaf columns.
+///
+/// Mirrors iceberg-java's ParquetSchemaUtil.addFallbackIds(), which iterates
+/// fileSchema.getFields() assigning ordinal IDs to top-level fields.
+pub(super) fn build_fallback_field_id_map(
+    parquet_schema: &SchemaDescriptor,
+) -> HashMap<i32, usize> {
+    let mut column_map = HashMap::new();
+    let mut leaf_idx = 0;
+
+    for (top_pos, field) in parquet_schema.root_schema().get_fields().iter().enumerate() {
+        let field_id = (top_pos + 1) as i32;
+        if field.is_primitive() {
+            column_map.insert(field_id, leaf_idx);
+        }
+        leaf_idx += leaf_count(field);
+    }
+
+    column_map
+}
+
+/// Apply name mapping to an Arrow schema for Parquet files lacking field IDs.
+///
+/// Assigns Iceberg field IDs based on column names using the name mapping,
+/// enabling correct projection on migrated files (e.g., from Hive/Spark via add_files).
+///
+/// Per Iceberg spec Column Projection rule #2:
+/// "Use schema.name-mapping.default metadata to map field id to columns without field id"
+/// https://iceberg.apache.org/spec/#column-projection
+///
+/// Corresponds to Java's ParquetSchemaUtil.applyNameMapping() and ApplyNameMapping visitor.
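+/// A mapping like `[{"field-id": 1, "names": ["id"]}]` (illustrative) tags the Arrow
+/// column named `id` with field ID 1 via its `PARQUET:field_id` metadata entry.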
+ +/// Apply name mapping to Arrow schema for Parquet files lacking field IDs. +/// +/// Assigns Iceberg field IDs based on column names using the name mapping, +/// enabling correct projection on migrated files (e.g., from Hive/Spark via add_files). +/// +/// Per Iceberg spec Column Projection rule #2: +/// "Use schema.name-mapping.default metadata to map field id to columns without field id" +/// https://iceberg.apache.org/spec/#column-projection +/// +/// Corresponds to Java's ParquetSchemaUtil.applyNameMapping() and ApplyNameMapping visitor. +/// The key difference is that Java operates on the Parquet MessageType, while we operate on the Arrow Schema. +/// +/// # Arguments +/// * `arrow_schema` - Arrow schema from Parquet file (without field IDs) +/// * `name_mapping` - Name mapping from table metadata (TableProperties.DEFAULT_NAME_MAPPING) +/// +/// # Returns +/// Arrow schema with field IDs assigned based on name mapping +pub(super) fn apply_name_mapping_to_arrow_schema( + arrow_schema: ArrowSchemaRef, + name_mapping: &NameMapping, +) -> Result<ArrowSchemaRef> { + debug_assert!( + arrow_schema + .fields() + .iter() + .next() + .is_none_or(|f| f.metadata().get(PARQUET_FIELD_ID_META_KEY).is_none()), + "Schema already has field IDs - name mapping should not be applied" + ); + + let fields_with_mapped_ids: Vec<_> = arrow_schema + .fields() + .iter() + .map(|field| { + // Look up this column name in name mapping to get the Iceberg field ID. + // Corresponds to Java's ApplyNameMapping visitor which calls + // nameMapping.find(currentPath()) and returns field.withId() if found. + // + // If the field isn't in the mapping, leave it WITHOUT assigning an ID + // (matching Java's behavior of returning the field unchanged). + // Later, during projection, fields without IDs are filtered out. + let mapped_field_opt = name_mapping + .fields() + .iter() + .find(|f| f.names().contains(&field.name().to_string())); + + let mut metadata = field.metadata().clone(); + + if let Some(mapped_field) = mapped_field_opt + && let Some(field_id) = mapped_field.field_id() + { + // Field found in mapping with a field_id → assign it + metadata.insert(PARQUET_FIELD_ID_META_KEY.to_string(), field_id.to_string()); + } + // If field_id is None, leave the field without an ID (will be filtered by projection) + + Field::new(field.name(), field.data_type().clone(), field.is_nullable()) + .with_metadata(metadata) + }) + .collect(); + + Ok(Arc::new(ArrowSchema::new_with_metadata( + fields_with_mapped_ids, + arrow_schema.metadata().clone(), + ))) +}
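A minimal sketch (not part of the patch) of the metadata shape both of these helpers produce: the Iceberg field ID travels on the Arrow field under the standard `PARQUET:field_id` metadata key, which is where projection later recovers it.

```rust
use std::collections::HashMap;

use arrow_schema::{DataType, Field};
use parquet::arrow::PARQUET_FIELD_ID_META_KEY;

fn main() {
    // What `apply_name_mapping_to_arrow_schema` (or the fallback helper below)
    // leaves behind for a column mapped to field ID 1.
    let field = Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([(
        PARQUET_FIELD_ID_META_KEY.to_string(),
        "1".to_string(),
    )]));

    // Projection parses the metadata entry back into an i32 field ID.
    let id: i32 = field.metadata()[PARQUET_FIELD_ID_META_KEY].parse().unwrap();
    assert_eq!(id, 1);
}
```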
+ +/// Add position-based fallback field IDs to Arrow schema for Parquet files lacking them. +/// Enables projection on migrated files (e.g., from Hive/Spark). +/// +/// Why at schema level (not per-batch): Efficiency - avoids repeated schema modification. +/// Why only top-level: Nested projection uses leaf column indices, not parent struct IDs. +/// Why 1-indexed: Compatibility with iceberg-java's ParquetSchemaUtil.addFallbackIds(). +pub(super) fn add_fallback_field_ids_to_arrow_schema( + arrow_schema: &ArrowSchemaRef, +) -> Arc<ArrowSchema> { + debug_assert!( + arrow_schema + .fields() + .iter() + .next() + .is_none_or(|f| f.metadata().get(PARQUET_FIELD_ID_META_KEY).is_none()), + "Schema already has field IDs" + ); + + let fields_with_fallback_ids: Vec<_> = arrow_schema + .fields() + .iter() + .enumerate() + .map(|(pos, field)| { + let mut metadata = field.metadata().clone(); + let field_id = (pos + 1) as i32; // 1-indexed for Java compatibility + metadata.insert(PARQUET_FIELD_ID_META_KEY.to_string(), field_id.to_string()); + + Field::new(field.name(), field.data_type().clone(), field.is_nullable()) + .with_metadata(metadata) + }) + .collect(); + + Arc::new(ArrowSchema::new_with_metadata( + fields_with_fallback_ids, + arrow_schema.metadata().clone(), + )) +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::fs::File; + use std::sync::Arc; + + use arrow_array::cast::AsArray; + use arrow_array::{ArrayRef, RecordBatch, StringArray}; + use arrow_schema::{DataType, Field, Schema as ArrowSchema, TimeUnit}; + use futures::TryStreamExt; + use parquet::arrow::{ArrowWriter, PARQUET_FIELD_ID_META_KEY, ProjectionMask}; + use parquet::basic::Compression; + use parquet::file::properties::WriterProperties; + use parquet::schema::parser::parse_message_type; + use parquet::schema::types::SchemaDescriptor; + use tempfile::TempDir; + + use crate::ErrorKind; + use crate::arrow::{ArrowReader, ArrowReaderBuilder}; + use crate::expr::{Bind, Reference}; + use crate::io::FileIO; + use crate::scan::{FileScanTask, FileScanTaskStream}; + use crate::spec::{DataFileFormat, Datum, NestedField, PrimitiveType, Schema, Type}; + + #[test] + fn test_arrow_projection_mask() { + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_identifier_field_ids(vec![1]) + .with_fields(vec![ + NestedField::required(1, "c1", Type::Primitive(PrimitiveType::String)).into(), + NestedField::optional(2, "c2", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional( + 3, + "c3", + Type::Primitive(PrimitiveType::Decimal { + precision: 38, + scale: 3, + }), + ) + .into(), + ]) + .build() + .unwrap(), + ); + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("c1", DataType::Utf8, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "1".to_string(), + )])), + // Type not supported + Field::new("c2", DataType::Duration(TimeUnit::Microsecond), true).with_metadata( + HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(), "2".to_string())]), + ), + // Precision is beyond the supported range + Field::new("c3", DataType::Decimal128(39, 3), true).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "3".to_string(), + )])), + ])); + + let message_type = " +message schema { + required binary c1 (STRING) = 1; + optional int32 c2 (INTEGER(8,true)) = 2; + optional fixed_len_byte_array(17) c3 (DECIMAL(39,3)) = 3; +} + "; + let parquet_type = parse_message_type(message_type).expect("should parse schema"); + let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_type)); + + // Try projecting the fields c2 and c3 with the unsupported data types + let err = ArrowReader::get_arrow_projection_mask( + &[1, 2, 3], + &schema, + &parquet_schema, + &arrow_schema, + false, + ) + .unwrap_err(); + + assert_eq!(err.kind(), ErrorKind::DataInvalid); + assert_eq!( + err.to_string(), + "DataInvalid => Unsupported Arrow data type: Duration(µs)".to_string() + ); + + // Omitting field c2, we still get
an error due to c3 being selected + let err = ArrowReader::get_arrow_projection_mask( + &[1, 3], + &schema, + &parquet_schema, + &arrow_schema, + false, + ) + .unwrap_err(); + + assert_eq!(err.kind(), ErrorKind::DataInvalid); + assert_eq!( + err.to_string(), + "DataInvalid => Failed to create decimal type, source: DataInvalid => Decimals with precision larger than 38 are not supported: 39".to_string() + ); + + // Finally avoid selecting fields with unsupported data types + let mask = ArrowReader::get_arrow_projection_mask( + &[1], + &schema, + &parquet_schema, + &arrow_schema, + false, + ) + .expect("Some ProjectionMask"); + assert_eq!(mask, ProjectionMask::leaves(&parquet_schema, vec![0])); + } + + /// Test schema evolution: reading old Parquet file (with only column 'a') + /// using a newer table schema (with columns 'a' and 'b'). + /// This tests that: + /// 1. get_arrow_projection_mask allows missing columns + /// 2. RecordBatchTransformer adds missing column 'b' with NULL values + #[tokio::test] + async fn test_schema_evolution_add_column() { + use arrow_array::{Array, Int32Array}; + + // New table schema: columns 'a' and 'b' (b was added later, file only has 'a') + let new_schema = Arc::new( + Schema::builder() + .with_schema_id(2) + .with_fields(vec![ + NestedField::required(1, "a", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional(2, "b", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + // Create Arrow schema for old Parquet file (only has column 'a') + let arrow_schema_old = Arc::new(ArrowSchema::new(vec![ + Field::new("a", DataType::Int32, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "1".to_string(), + )])), + ])); + + // Write old Parquet file with only column 'a' + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_io = FileIO::new_with_fs(); + + let data_a = Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef; + let to_write = RecordBatch::try_new(arrow_schema_old.clone(), vec![data_a]).unwrap(); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + let file = File::create(format!("{table_location}/old_file.parquet")).unwrap(); + let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); + writer.write(&to_write).expect("Writing batch"); + writer.close().unwrap(); + + // Read the old Parquet file using the NEW schema (with column 'b') + let reader = ArrowReaderBuilder::new(file_io).build(); + let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/old_file.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/old_file.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: new_schema.clone(), + project_field_ids: vec![1, 2], // Request both columns 'a' and 'b' + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + let result = reader + .read(tasks) + .unwrap() + .try_collect::<Vec<_>>() + .await + .unwrap(); + + // Verify we got the correct data + assert_eq!(result.len(), 1); + let batch = &result[0]; + + // Should have 2 columns now + assert_eq!(batch.num_columns(), 2); + assert_eq!(batch.num_rows(), 3); + + // Column 'a' should have the original data + let col_a = batch
.column(0) + .as_primitive::<arrow_array::types::Int32Type>(); + assert_eq!(col_a.values(), &[1, 2, 3]); + + // Column 'b' should be all NULLs (it didn't exist in the old file) + let col_b = batch + .column(1) + .as_primitive::<arrow_array::types::Int32Type>(); + assert_eq!(col_b.null_count(), 3); + assert!(col_b.is_null(0)); + assert!(col_b.is_null(1)); + assert!(col_b.is_null(2)); + } + + /// Test reading Parquet files without field ID metadata (e.g., migrated tables). + /// This exercises the position-based fallback path. + /// + /// Corresponds to Java's ParquetSchemaUtil.addFallbackIds() + pruneColumnsFallback() + /// in /parquet/src/main/java/org/apache/iceberg/parquet/ParquetSchemaUtil.java + #[tokio::test] + async fn test_read_parquet_file_without_field_ids() { + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "name", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(2, "age", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + // Parquet file from a migrated table - no field ID metadata + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("name", DataType::Utf8, false), + Field::new("age", DataType::Int32, false), + ])); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_io = FileIO::new_with_fs(); + + let name_data = vec!["Alice", "Bob", "Charlie"]; + let age_data = vec![30, 25, 35]; + + use arrow_array::Int32Array; + let name_col = Arc::new(StringArray::from(name_data.clone())) as ArrayRef; + let age_col = Arc::new(Int32Array::from(age_data.clone())) as ArrayRef; + + let to_write = RecordBatch::try_new(arrow_schema.clone(), vec![name_col, age_col]).unwrap(); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + + let file = File::create(format!("{table_location}/1.parquet")).unwrap(); + let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); + + writer.write(&to_write).expect("Writing batch"); + writer.close().unwrap(); + + let reader = ArrowReaderBuilder::new(file_io).build(); + + let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/1.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 2], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + let result = reader + .read(tasks) + .unwrap() + .try_collect::<Vec<_>>() + .await + .unwrap(); + + assert_eq!(result.len(), 1); + let batch = &result[0]; + assert_eq!(batch.num_rows(), 3); + assert_eq!(batch.num_columns(), 2); + + // Verify position-based mapping: field_id 1 → position 0, field_id 2 → position 1 + let name_array = batch.column(0).as_string::<i32>(); + assert_eq!(name_array.value(0), "Alice"); + assert_eq!(name_array.value(1), "Bob"); + assert_eq!(name_array.value(2), "Charlie"); + + let age_array = batch + .column(1) + .as_primitive::<arrow_array::types::Int32Type>(); + assert_eq!(age_array.value(0), 30); + assert_eq!(age_array.value(1), 25); + assert_eq!(age_array.value(2), 35); + } + + /// Test reading Parquet files without field IDs with partial projection.
+ /// Only a subset of columns are requested, verifying position-based fallback + /// handles column selection correctly. + #[tokio::test] + async fn test_read_parquet_without_field_ids_partial_projection() { + use arrow_array::Int32Array; + + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "col1", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(2, "col2", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::required(3, "col3", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(4, "col4", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("col1", DataType::Utf8, false), + Field::new("col2", DataType::Int32, false), + Field::new("col3", DataType::Utf8, false), + Field::new("col4", DataType::Int32, false), + ])); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_io = FileIO::new_with_fs(); + + let col1_data = Arc::new(StringArray::from(vec!["a", "b"])) as ArrayRef; + let col2_data = Arc::new(Int32Array::from(vec![10, 20])) as ArrayRef; + let col3_data = Arc::new(StringArray::from(vec!["c", "d"])) as ArrayRef; + let col4_data = Arc::new(Int32Array::from(vec![30, 40])) as ArrayRef; + + let to_write = RecordBatch::try_new(arrow_schema.clone(), vec![ + col1_data, col2_data, col3_data, col4_data, + ]) + .unwrap(); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + + let file = File::create(format!("{table_location}/1.parquet")).unwrap(); + let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); + + writer.write(&to_write).expect("Writing batch"); + writer.close().unwrap(); + + let reader = ArrowReaderBuilder::new(file_io).build(); + + let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/1.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 3], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + let result = reader + .read(tasks) + .unwrap() + .try_collect::<Vec<_>>() + .await + .unwrap(); + + assert_eq!(result.len(), 1); + let batch = &result[0]; + assert_eq!(batch.num_rows(), 2); + assert_eq!(batch.num_columns(), 2); + + let col1_array = batch.column(0).as_string::<i32>(); + assert_eq!(col1_array.value(0), "a"); + assert_eq!(col1_array.value(1), "b"); + + let col3_array = batch.column(1).as_string::<i32>(); + assert_eq!(col3_array.value(0), "c"); + assert_eq!(col3_array.value(1), "d"); + } + + /// Test reading Parquet files without field IDs with schema evolution. + /// The Iceberg schema has more fields than the Parquet file, testing that + /// missing columns are filled with NULLs.
+ #[tokio::test] + async fn test_read_parquet_without_field_ids_schema_evolution() { + use arrow_array::{Array, Int32Array}; + + // Schema with field 3 added after the file was written + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "name", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(2, "age", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional(3, "city", Type::Primitive(PrimitiveType::String)).into(), + ]) + .build() + .unwrap(), + ); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("name", DataType::Utf8, false), + Field::new("age", DataType::Int32, false), + ])); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_io = FileIO::new_with_fs(); + + let name_data = Arc::new(StringArray::from(vec!["Alice", "Bob"])) as ArrayRef; + let age_data = Arc::new(Int32Array::from(vec![30, 25])) as ArrayRef; + + let to_write = + RecordBatch::try_new(arrow_schema.clone(), vec![name_data, age_data]).unwrap(); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + + let file = File::create(format!("{table_location}/1.parquet")).unwrap(); + let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); + + writer.write(&to_write).expect("Writing batch"); + writer.close().unwrap(); + + let reader = ArrowReaderBuilder::new(file_io).build(); + + let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/1.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 2, 3], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + let result = reader + .read(tasks) + .unwrap() + .try_collect::<Vec<_>>() + .await + .unwrap(); + + assert_eq!(result.len(), 1); + let batch = &result[0]; + assert_eq!(batch.num_rows(), 2); + assert_eq!(batch.num_columns(), 3); + + let name_array = batch.column(0).as_string::<i32>(); + assert_eq!(name_array.value(0), "Alice"); + assert_eq!(name_array.value(1), "Bob"); + + let age_array = batch + .column(1) + .as_primitive::<arrow_array::types::Int32Type>(); + assert_eq!(age_array.value(0), 30); + assert_eq!(age_array.value(1), 25); + + // Verify missing column filled with NULLs + let city_array = batch.column(2).as_string::<i32>(); + assert_eq!(city_array.null_count(), 2); + assert!(city_array.is_null(0)); + assert!(city_array.is_null(1)); + } + + /// Test reading Parquet files without field IDs that have multiple row groups. + /// This ensures the position-based fallback works correctly across row group boundaries.
+ #[tokio::test] + async fn test_read_parquet_without_field_ids_multiple_row_groups() { + use arrow_array::Int32Array; + + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "name", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(2, "value", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("name", DataType::Utf8, false), + Field::new("value", DataType::Int32, false), + ])); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_io = FileIO::new_with_fs(); + + // Small row group size to create multiple row groups + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .set_write_batch_size(2) + .set_max_row_group_row_count(Some(2)) + .build(); + + let file = File::create(format!("{table_location}/1.parquet")).unwrap(); + let mut writer = ArrowWriter::try_new(file, arrow_schema.clone(), Some(props)).unwrap(); + + // Write 6 rows in 3 batches (will create 3 row groups) + for batch_num in 0..3 { + let name_data = Arc::new(StringArray::from(vec![ + format!("name_{}", batch_num * 2), + format!("name_{}", batch_num * 2 + 1), + ])) as ArrayRef; + let value_data = + Arc::new(Int32Array::from(vec![batch_num * 2, batch_num * 2 + 1])) as ArrayRef; + + let batch = + RecordBatch::try_new(arrow_schema.clone(), vec![name_data, value_data]).unwrap(); + writer.write(&batch).expect("Writing batch"); + } + writer.close().unwrap(); + + let reader = ArrowReaderBuilder::new(file_io).build(); + + let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/1.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 2], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + let result = reader + .read(tasks) + .unwrap() + .try_collect::<Vec<_>>() + .await + .unwrap(); + + assert!(!result.is_empty()); + + let mut all_names = Vec::new(); + let mut all_values = Vec::new(); + + for batch in &result { + let name_array = batch.column(0).as_string::<i32>(); + let value_array = batch + .column(1) + .as_primitive::<arrow_array::types::Int32Type>(); + + for i in 0..batch.num_rows() { + all_names.push(name_array.value(i).to_string()); + all_values.push(value_array.value(i)); + } + } + + assert_eq!(all_names.len(), 6); + assert_eq!(all_values.len(), 6); + + for i in 0..6 { + assert_eq!(all_names[i], format!("name_{i}")); + assert_eq!(all_values[i], i as i32); + } + } + + /// Test reading Parquet files without field IDs with nested types (struct). + /// Java's pruneColumnsFallback() projects entire top-level columns including nested content. + /// This test verifies that a top-level struct field is projected correctly with all its nested fields.
+ #[tokio::test] + async fn test_read_parquet_without_field_ids_with_struct() { + use arrow_array::{Int32Array, StructArray}; + use arrow_schema::Fields; + + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::required( + 2, + "person", + Type::Struct(crate::spec::StructType::new(vec![ + NestedField::required( + 3, + "name", + Type::Primitive(PrimitiveType::String), + ) + .into(), + NestedField::required(4, "age", Type::Primitive(PrimitiveType::Int)) + .into(), + ])), + ) + .into(), + ]) + .build() + .unwrap(), + ); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new( + "person", + DataType::Struct(Fields::from(vec![ + Field::new("name", DataType::Utf8, false), + Field::new("age", DataType::Int32, false), + ])), + false, + ), + ])); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_io = FileIO::new_with_fs(); + + let id_data = Arc::new(Int32Array::from(vec![1, 2])) as ArrayRef; + let name_data = Arc::new(StringArray::from(vec!["Alice", "Bob"])) as ArrayRef; + let age_data = Arc::new(Int32Array::from(vec![30, 25])) as ArrayRef; + let person_data = Arc::new(StructArray::from(vec![ + ( + Arc::new(Field::new("name", DataType::Utf8, false)), + name_data, + ), + ( + Arc::new(Field::new("age", DataType::Int32, false)), + age_data, + ), + ])) as ArrayRef; + + let to_write = + RecordBatch::try_new(arrow_schema.clone(), vec![id_data, person_data]).unwrap(); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + + let file = File::create(format!("{table_location}/1.parquet")).unwrap(); + let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); + + writer.write(&to_write).expect("Writing batch"); + writer.close().unwrap(); + + let reader = ArrowReaderBuilder::new(file_io).build(); + + let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/1.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 2], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + let result = reader + .read(tasks) + .unwrap() + .try_collect::<Vec<_>>() + .await + .unwrap(); + + assert_eq!(result.len(), 1); + let batch = &result[0]; + assert_eq!(batch.num_rows(), 2); + assert_eq!(batch.num_columns(), 2); + + let id_array = batch + .column(0) + .as_primitive::<arrow_array::types::Int32Type>(); + assert_eq!(id_array.value(0), 1); + assert_eq!(id_array.value(1), 2); + + let person_array = batch.column(1).as_struct(); + assert_eq!(person_array.num_columns(), 2); + + let name_array = person_array.column(0).as_string::<i32>(); + assert_eq!(name_array.value(0), "Alice"); + assert_eq!(name_array.value(1), "Bob"); + + let age_array = person_array + .column(1) + .as_primitive::<arrow_array::types::Int32Type>(); + assert_eq!(age_array.value(0), 30); + assert_eq!(age_array.value(1), 25); + } + + /// Test reading Parquet files without field IDs with schema evolution - column added in the middle.
+ /// When a new column is inserted between existing columns in the schema order, + /// the fallback projection must correctly map field IDs to output positions. + #[tokio::test] + async fn test_read_parquet_without_field_ids_schema_evolution_add_column_in_middle() { + use arrow_array::{Array, Int32Array}; + + let arrow_schema_old = Arc::new(ArrowSchema::new(vec![ + Field::new("col0", DataType::Int32, true), + Field::new("col1", DataType::Int32, true), + ])); + + // New column added between existing columns: col0 (id=1), newCol (id=5), col1 (id=2) + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "col0", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional(5, "newCol", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional(2, "col1", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_io = FileIO::new_with_fs(); + + let col0_data = Arc::new(Int32Array::from(vec![1, 2])) as ArrayRef; + let col1_data = Arc::new(Int32Array::from(vec![10, 20])) as ArrayRef; + + let to_write = + RecordBatch::try_new(arrow_schema_old.clone(), vec![col0_data, col1_data]).unwrap(); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + + let file = File::create(format!("{table_location}/1.parquet")).unwrap(); + let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); + writer.write(&to_write).expect("Writing batch"); + writer.close().unwrap(); + + let reader = ArrowReaderBuilder::new(file_io).build(); + + let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/1.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 5, 2], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + let result = reader + .read(tasks) + .unwrap() + .try_collect::<Vec<_>>() + .await + .unwrap(); + + assert_eq!(result.len(), 1); + let batch = &result[0]; + assert_eq!(batch.num_rows(), 2); + assert_eq!(batch.num_columns(), 3); + + let result_col0 = batch + .column(0) + .as_primitive::<arrow_array::types::Int32Type>(); + assert_eq!(result_col0.value(0), 1); + assert_eq!(result_col0.value(1), 2); + + // New column should be NULL (doesn't exist in old file) + let result_newcol = batch + .column(1) + .as_primitive::<arrow_array::types::Int32Type>(); + assert_eq!(result_newcol.null_count(), 2); + assert!(result_newcol.is_null(0)); + assert!(result_newcol.is_null(1)); + + let result_col1 = batch + .column(2) + .as_primitive::<arrow_array::types::Int32Type>(); + assert_eq!(result_col1.value(0), 10); + assert_eq!(result_col1.value(1), 20); + } + + /// Test reading Parquet files without field IDs with a filter that eliminates all row groups. + /// During development of field ID mapping, we saw a panic when row_selection_enabled=true and + /// all row groups are filtered out.
+ #[tokio::test] + async fn test_read_parquet_without_field_ids_filter_eliminates_all_rows() { + use arrow_array::{Float64Array, Int32Array}; + + // Schema with fields that will use fallback IDs 1, 2, 3 + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::required(2, "name", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(3, "value", Type::Primitive(PrimitiveType::Double)) + .into(), + ]) + .build() + .unwrap(), + ); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, false), + Field::new("value", DataType::Float64, false), + ])); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_io = FileIO::new_with_fs(); + + // Write data where all ids are >= 10 + let id_data = Arc::new(Int32Array::from(vec![10, 11, 12])) as ArrayRef; + let name_data = Arc::new(StringArray::from(vec!["a", "b", "c"])) as ArrayRef; + let value_data = Arc::new(Float64Array::from(vec![100.0, 200.0, 300.0])) as ArrayRef; + + let to_write = + RecordBatch::try_new(arrow_schema.clone(), vec![id_data, name_data, value_data]) + .unwrap(); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + + let file = File::create(format!("{table_location}/1.parquet")).unwrap(); + let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); + writer.write(&to_write).expect("Writing batch"); + writer.close().unwrap(); + + // Filter that eliminates all row groups: id < 5 + let predicate = Reference::new("id").less_than(Datum::int(5)); + + // Enable both row_group_filtering and row_selection: this combination previously triggered the panic + let reader = ArrowReaderBuilder::new(file_io) + .with_row_group_filtering_enabled(true) + .with_row_selection_enabled(true) + .build(); + + let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/1.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 2, 3], + predicate: Some(predicate.bind(schema, true).unwrap()), + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + // Should no longer panic + let result = reader + .read(tasks) + .unwrap() + .try_collect::<Vec<_>>() + .await + .unwrap(); + + // Should return empty results + assert!(result.is_empty() || result.iter().all(|batch| batch.num_rows() == 0)); + } + + /// Test bucket partitioning reads source column from data file (not partition metadata). + /// + /// This is an integration test verifying the complete ArrowReader pipeline with bucket partitioning. + /// It corresponds to TestRuntimeFiltering tests in Iceberg Java (e.g., testRenamedSourceColumnTable).
+ /// + /// # Iceberg Spec Requirements + /// + /// Per the Iceberg spec "Column Projection" section: + /// > "Return the value from partition metadata if an **Identity Transform** exists for the field" + /// + /// This means: + /// - Identity transforms (e.g., `identity(dept)`) use constants from partition metadata + /// - Non-identity transforms (e.g., `bucket(4, id)`) must read source columns from data files + /// - Partition metadata for bucket transforms stores bucket numbers (0-3), NOT source values + /// + /// Java's PartitionUtil.constantsMap() implements this via: + /// ```java + /// if (field.transform().isIdentity()) { + /// idToConstant.put(field.sourceId(), converted); + /// } + /// ``` + /// + /// # What This Test Verifies + /// + /// This test ensures the full ArrowReader → RecordBatchTransformer pipeline correctly handles + /// bucket partitioning when FileScanTask provides partition_spec and partition_data: + /// + /// - Parquet file has field_id=1 named "id" with actual data [1, 5, 9, 13] + /// - FileScanTask specifies partition_spec with bucket(4, id) and partition_data with bucket=1 + /// - RecordBatchTransformer.constants_map() excludes bucket-partitioned field from constants + /// - ArrowReader correctly reads [1, 5, 9, 13] from the data file + /// - Values are NOT replaced with constant 1 from partition metadata + /// + /// # Why This Matters + /// + /// Without correct handling: + /// - Runtime filtering would break (e.g., `WHERE id = 5` would fail) + /// - Query results would be incorrect (all rows would have id=1) + /// - Bucket partitioning would be unusable for query optimization + /// + /// # References + /// - Iceberg spec: format/spec.md "Column Projection" + "Partition Transforms" + /// - Java test: spark/src/test/java/.../TestRuntimeFiltering.java + /// - Java impl: core/src/main/java/org/apache/iceberg/util/PartitionUtil.java + #[tokio::test] + async fn test_bucket_partitioning_reads_source_column_from_file() { + use arrow_array::Int32Array; + + use crate::spec::{Literal, PartitionSpec, Struct, Transform}; + + // Iceberg schema with id and name columns + let schema = Arc::new( + Schema::builder() + .with_schema_id(0) + .with_fields(vec![ + NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional(2, "name", Type::Primitive(PrimitiveType::String)).into(), + ]) + .build() + .unwrap(), + ); + + // Partition spec: bucket(4, id) + let partition_spec = Arc::new( + PartitionSpec::builder(schema.clone()) + .with_spec_id(0) + .add_partition_field("id", "id_bucket", Transform::Bucket(4)) + .unwrap() + .build() + .unwrap(), + ); + + // Partition data: bucket value is 1 + let partition_data = Struct::from_iter(vec![Some(Literal::int(1))]); + + // Create Arrow schema with field IDs for Parquet file + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "1".to_string(), + )])), + Field::new("name", DataType::Utf8, true).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "2".to_string(), + )])), + ])); + + // Write Parquet file with data + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_io = FileIO::new_with_fs(); + + let id_data = Arc::new(Int32Array::from(vec![1, 5, 9, 13])) as ArrayRef; + let name_data = + Arc::new(StringArray::from(vec!["Alice", "Bob", "Charlie", "Dave"])) as ArrayRef; + + let to_write = + 
RecordBatch::try_new(arrow_schema.clone(), vec![id_data, name_data]).unwrap(); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + let file = File::create(format!("{}/data.parquet", &table_location)).unwrap(); + let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); + writer.write(&to_write).expect("Writing batch"); + writer.close().unwrap(); + + // Read the Parquet file with partition spec and data + let reader = ArrowReaderBuilder::new(file_io).build(); + let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/data.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/data.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 2], + predicate: None, + deletes: vec![], + partition: Some(partition_data), + partition_spec: Some(partition_spec), + name_mapping: None, + case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + let result = reader + .read(tasks) + .unwrap() + .try_collect::<Vec<_>>() + .await + .unwrap(); + + // Verify we got the correct data + assert_eq!(result.len(), 1); + let batch = &result[0]; + + assert_eq!(batch.num_columns(), 2); + assert_eq!(batch.num_rows(), 4); + + // The id column MUST contain actual values from the Parquet file [1, 5, 9, 13], + // NOT the constant partition value 1 + let id_col = batch + .column(0) + .as_primitive::<arrow_array::types::Int32Type>(); + assert_eq!(id_col.value(0), 1); + assert_eq!(id_col.value(1), 5); + assert_eq!(id_col.value(2), 9); + assert_eq!(id_col.value(3), 13); + + let name_col = batch.column(1).as_string::<i32>(); + assert_eq!(name_col.value(0), "Alice"); + assert_eq!(name_col.value(1), "Bob"); + assert_eq!(name_col.value(2), "Charlie"); + assert_eq!(name_col.value(3), "Dave"); + }
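The spec rule exercised above reduces to a small predicate on the transform type. A sketch (not part of the patch; the names and tuple shape are invented) of the constants-map construction it implies:

```rust
use std::collections::HashMap;

// Hypothetical stand-in for a partition field: (source column ID, transform, stored value).
enum Transform {
    Identity,
    Bucket(u32),
}

// Only identity-transformed sources may be served from partition metadata as
// constants; bucket(..) stores the bucket number, not the source value.
fn constants_map(partition: &[(i32, Transform, i64)]) -> HashMap<i32, i64> {
    partition
        .iter()
        .filter(|(_, transform, _)| matches!(transform, Transform::Identity))
        .map(|(source_id, _, value)| (*source_id, *value))
        .collect()
}

fn main() {
    // partition = [identity(dept) = 42, bucket(4, id) = 1]
    let constants = constants_map(&[
        (1, Transform::Identity, 42),
        (2, Transform::Bucket(4), 1),
    ]);
    assert_eq!(constants.get(&1), Some(&42));
    // `id` has no constant: it must be read from the data file.
    assert_eq!(constants.get(&2), None);
}
```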
+ + /// Regression test: predicate on a column after nested types in a migrated file (no field IDs). + /// Schema has struct, list, and map columns before the predicate target (`id`), + /// exercising the fallback field ID mapping across all nested type variants. + #[tokio::test] + async fn test_predicate_on_migrated_file_with_nested_types() { + use serde::{Deserialize, Serialize}; + use serde_arrow::schema::{SchemaLike, TracingOptions}; + + #[derive(Serialize, Deserialize)] + struct Person { + name: String, + age: i32, + } + + #[derive(Serialize, Deserialize)] + struct Row { + person: Person, + people: Vec<Person>, + props: std::collections::BTreeMap<String, String>, + id: i32, + } + + let rows = vec![ + Row { + person: Person { + name: "Alice".into(), + age: 30, + }, + people: vec![Person { + name: "Alice".into(), + age: 30, + }], + props: [("k1".into(), "v1".into())].into(), + id: 1, + }, + Row { + person: Person { + name: "Bob".into(), + age: 25, + }, + people: vec![Person { + name: "Bob".into(), + age: 25, + }], + props: [("k2".into(), "v2".into())].into(), + id: 2, + }, + Row { + person: Person { + name: "Carol".into(), + age: 40, + }, + people: vec![Person { + name: "Carol".into(), + age: 40, + }], + props: [("k3".into(), "v3".into())].into(), + id: 3, + }, + ]; + + let tracing_options = TracingOptions::default() + .map_as_struct(false) + .strings_as_large_utf8(false) + .sequence_as_large_list(false); + let fields = Vec::<arrow_schema::FieldRef>::from_type::<Row>(tracing_options).unwrap(); + let arrow_schema = Arc::new(ArrowSchema::new(fields.clone())); + let batch = serde_arrow::to_record_batch(&fields, &rows).unwrap(); + + // Fallback field IDs: person=1, people=2, props=3, id=4 + let iceberg_schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required( + 1, + "person", + Type::Struct(crate::spec::StructType::new(vec![ + NestedField::required( + 5, + "name", + Type::Primitive(PrimitiveType::String), + ) + .into(), + NestedField::required(6, "age", Type::Primitive(PrimitiveType::Int)) + .into(), + ])), + ) + .into(), + NestedField::required( + 2, + "people", + Type::List(crate::spec::ListType { + element_field: NestedField::required( + 7, + "element", + Type::Struct(crate::spec::StructType::new(vec![ + NestedField::required( + 8, + "name", + Type::Primitive(PrimitiveType::String), + ) + .into(), + NestedField::required( + 9, + "age", + Type::Primitive(PrimitiveType::Int), + ) + .into(), + ])), + ) + .into(), + }), + ) + .into(), + NestedField::required( + 3, + "props", + Type::Map(crate::spec::MapType { + key_field: NestedField::required( + 10, + "key", + Type::Primitive(PrimitiveType::String), + ) + .into(), + value_field: NestedField::required( + 11, + "value", + Type::Primitive(PrimitiveType::String), + ) + .into(), + }), + ) + .into(), + NestedField::required(4, "id", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_path = format!("{table_location}/1.parquet"); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + let file = File::create(&file_path).unwrap(); + let mut writer = ArrowWriter::try_new(file, arrow_schema, Some(props)).unwrap(); + writer.write(&batch).expect("Writing batch"); + writer.close().unwrap(); + + let predicate = Reference::new("id").greater_than(Datum::int(1)); + + let reader = ArrowReaderBuilder::new(FileIO::new_with_fs()) + .with_row_group_filtering_enabled(true) + .with_row_selection_enabled(true) + .build(); + + let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(&file_path).unwrap().len(), + start: 0, + length: 0, + record_count: None, + data_file_path: file_path,
data_file_format: DataFileFormat::Parquet, + schema: iceberg_schema.clone(), + project_field_ids: vec![4], + predicate: Some(predicate.bind(iceberg_schema, true).unwrap()), + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + let result = reader + .read(tasks) + .unwrap() + .try_collect::<Vec<_>>() + .await + .unwrap(); + + let ids: Vec<i32> = result + .iter() + .flat_map(|b| { + b.column(0) + .as_primitive::<arrow_array::types::Int32Type>() + .values() + .iter() + .copied() + }) + .collect(); + assert_eq!(ids, vec![2, 3]); + } +} diff --git a/crates/iceberg/src/arrow/reader/row_filter.rs b/crates/iceberg/src/arrow/reader/row_filter.rs new file mode 100644 index 0000000000..52f7260cc6 --- /dev/null +++ b/crates/iceberg/src/arrow/reader/row_filter.rs @@ -0,0 +1,616 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Predicate-driven row filtering for `ArrowReader`: constructing Arrow `RowFilter`s +//! from Iceberg predicates, row-group selection based on column statistics, and +//! row-selection via the Parquet page index. Also includes byte-range row-group +//! filtering used for file splitting. + +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; + +use parquet::arrow::ProjectionMask; +use parquet::arrow::arrow_reader::{ArrowPredicateFn, RowFilter, RowSelection}; +use parquet::file::metadata::ParquetMetaData; +use parquet::schema::types::SchemaDescriptor; + +use super::{ArrowReader, PredicateConverter}; +use crate::error::Result; +use crate::expr::BoundPredicate; +use crate::expr::visitors::bound_predicate_visitor::visit; +use crate::expr::visitors::page_index_evaluator::PageIndexEvaluator; +use crate::expr::visitors::row_group_metrics_evaluator::RowGroupMetricsEvaluator; +use crate::spec::Schema; +use crate::{Error, ErrorKind};
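Before the implementation, a sketch (not part of the patch; the predicate is invented and assumes an Int32 first projected column) of the arrow-rs machinery this module drives: an `ArrowPredicateFn` pairs a `ProjectionMask` with a closure mapping each batch of the projected columns to a `BooleanArray` of keep-flags, and a `RowFilter` chains such predicates.

```rust
use arrow_array::BooleanArray;
use arrow_array::cast::AsArray;
use arrow_array::types::Int32Type;
use parquet::arrow::ProjectionMask;
use parquet::arrow::arrow_reader::{ArrowPredicateFn, RowFilter};

// Keep only rows whose first projected column is greater than 1; NULL inputs
// stay NULL in the output, matching SQL semantics.
fn example_filter(mask: ProjectionMask) -> RowFilter {
    let predicate = ArrowPredicateFn::new(mask, |batch| {
        let col = batch.column(0).as_primitive::<Int32Type>();
        Ok(BooleanArray::from_iter(
            col.iter().map(|v| v.map(|v| v > 1)),
        ))
    });
    RowFilter::new(vec![Box::new(predicate)])
}
```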
+ +impl ArrowReader { + pub(super) fn get_row_filter( + predicates: &BoundPredicate, + parquet_schema: &SchemaDescriptor, + iceberg_field_ids: &HashSet<i32>, + field_id_map: &HashMap<i32, usize>, + ) -> Result<RowFilter> { + // Collect Parquet column indices from field ids. + // If the field id is not found in Parquet schema, it will be ignored due to schema evolution. + let mut column_indices = iceberg_field_ids + .iter() + .filter_map(|field_id| field_id_map.get(field_id).cloned()) + .collect::<Vec<_>>(); + column_indices.sort(); + + // The converter that converts `BoundPredicates` to `ArrowPredicates` + let mut converter = PredicateConverter { + parquet_schema, + column_map: field_id_map, + column_indices: &column_indices, + }; + + // After collecting the leaf column indices used in the predicate, + // create the projection mask for the Arrow predicates. + let projection_mask = ProjectionMask::leaves(parquet_schema, column_indices.clone()); + let predicate_func = visit(&mut converter, predicates)?; + let arrow_predicate = ArrowPredicateFn::new(projection_mask, predicate_func); + Ok(RowFilter::new(vec![Box::new(arrow_predicate)])) + } + + pub(super) fn get_selected_row_group_indices( + predicate: &BoundPredicate, + parquet_metadata: &Arc<ParquetMetaData>, + field_id_map: &HashMap<i32, usize>, + snapshot_schema: &Schema, + ) -> Result<Vec<usize>> { + let row_groups_metadata = parquet_metadata.row_groups(); + let mut results = Vec::with_capacity(row_groups_metadata.len()); + + for (idx, row_group_metadata) in row_groups_metadata.iter().enumerate() { + if RowGroupMetricsEvaluator::eval( + predicate, + row_group_metadata, + field_id_map, + snapshot_schema, + )? { + results.push(idx); + } + } + + Ok(results) + } + + pub(super) fn get_row_selection_for_filter_predicate( + predicate: &BoundPredicate, + parquet_metadata: &Arc<ParquetMetaData>, + selected_row_groups: &Option<Vec<usize>>, + field_id_map: &HashMap<i32, usize>, + snapshot_schema: &Schema, + ) -> Result<RowSelection> { + let Some(column_index) = parquet_metadata.column_index() else { + return Err(Error::new( + ErrorKind::Unexpected, + "Parquet file metadata does not contain a column index", + )); + }; + + let Some(offset_index) = parquet_metadata.offset_index() else { + return Err(Error::new( + ErrorKind::Unexpected, + "Parquet file metadata does not contain an offset index", + )); + }; + + // If all row groups were filtered out, return an empty RowSelection (select no rows) + if let Some(selected_row_groups) = selected_row_groups + && selected_row_groups.is_empty() + { + return Ok(RowSelection::from(Vec::new())); + } + + let mut selected_row_groups_idx = 0; + + let page_index = column_index + .iter() + .enumerate() + .zip(offset_index) + .zip(parquet_metadata.row_groups()); + + let mut results = Vec::new(); + for (((idx, column_index), offset_index), row_group_metadata) in page_index { + if let Some(selected_row_groups) = selected_row_groups { + // skip row groups that aren't present in selected_row_groups + if idx == selected_row_groups[selected_row_groups_idx] { + selected_row_groups_idx += 1; + } else { + continue; + } + } + + let selections_for_page = PageIndexEvaluator::eval( + predicate, + column_index, + offset_index, + row_group_metadata, + field_id_map, + snapshot_schema, + )?; + + results.push(selections_for_page); + + if let Some(selected_row_groups) = selected_row_groups + && selected_row_groups_idx == selected_row_groups.len() + { + break; + } + } + + Ok(results.into_iter().flatten().collect::<Vec<_>>().into()) + }
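The byte-range check in the next function is a plain half-open interval intersection; a tiny sketch (not part of the patch) of the invariant it relies on:

```rust
// [start, end) intersects [rg_start, rg_end) iff rg_start < end && start < rg_end.
fn overlaps(start: u64, end: u64, rg_start: u64, rg_end: u64) -> bool {
    rg_start < end && start < rg_end
}

fn main() {
    // A split covering bytes [4, 100) picks up a row group spanning [4, 50)...
    assert!(overlaps(4, 100, 4, 50));
    // ...but not one starting exactly at the split end: half-open ranges let
    // adjacent splits partition the file with no row group read twice.
    assert!(!overlaps(4, 100, 100, 180));
}
```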
+ + /// Filters row groups by byte range to support Iceberg's file splitting. + /// + /// Iceberg splits large files at row group boundaries, so we only read row groups + /// whose byte ranges overlap with [start, start+length). + pub(super) fn filter_row_groups_by_byte_range( + parquet_metadata: &Arc<ParquetMetaData>, + start: u64, + length: u64, + ) -> Result<Vec<usize>> { + let row_groups = parquet_metadata.row_groups(); + let mut selected = Vec::new(); + let end = start + length; + + // Row groups are stored sequentially after the 4-byte magic header. + let mut current_byte_offset = 4u64; + + for (idx, row_group) in row_groups.iter().enumerate() { + let row_group_size = row_group.compressed_size() as u64; + let row_group_end = current_byte_offset + row_group_size; + + if current_byte_offset < end && start < row_group_end { + selected.push(idx); + } + + current_byte_offset = row_group_end; + } + + Ok(selected) + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::fs::File; + use std::sync::Arc; + + use arrow_array::cast::AsArray; + use arrow_array::{ArrayRef, LargeStringArray, RecordBatch, StringArray}; + use arrow_schema::{DataType, Field, Schema as ArrowSchema}; + use futures::TryStreamExt; + use parquet::arrow::{ArrowWriter, PARQUET_FIELD_ID_META_KEY}; + use parquet::basic::Compression; + use parquet::file::properties::WriterProperties; + use tempfile::TempDir; + + use crate::arrow::{ArrowReader, ArrowReaderBuilder}; + use crate::expr::{Bind, Predicate, Reference}; + use crate::io::FileIO; + use crate::scan::{FileScanTask, FileScanTaskStream}; + use crate::spec::{DataFileFormat, Datum, NestedField, PrimitiveType, Schema, SchemaRef, Type}; + + async fn test_perform_read( + predicate: Predicate, + schema: SchemaRef, + table_location: String, + reader: ArrowReader, + ) -> Vec<Option<String>> { + let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/1.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1], + predicate: Some(predicate.bind(schema, true).unwrap()), + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + let result = reader + .read(tasks) + .unwrap() + .try_collect::<Vec<_>>() + .await + .unwrap(); + + result[0].columns()[0] + .as_string_opt::<i32>() + .unwrap() + .iter() + .map(|v| v.map(ToOwned::to_owned)) + .collect::<Vec<_>>() + } + + fn setup_kleene_logic( + data_for_col_a: Vec<Option<String>>, + col_a_type: DataType, + ) -> (FileIO, SchemaRef, String, TempDir) { + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "a", Type::Primitive(PrimitiveType::String)).into(), + ]) + .build() + .unwrap(), + ); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("a", col_a_type.clone(), true).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "1".to_string(), + )])), + ])); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + + let file_io = FileIO::new_with_fs(); + + let col = match col_a_type { + DataType::Utf8 => Arc::new(StringArray::from(data_for_col_a)) as ArrayRef, + DataType::LargeUtf8 => Arc::new(LargeStringArray::from(data_for_col_a)) as ArrayRef, + _ => panic!("unexpected col_a_type"), + }; + + let to_write = RecordBatch::try_new(arrow_schema.clone(), vec![col]).unwrap(); + + // Write the Parquet files + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + + let file = File::create(format!("{table_location}/1.parquet")).unwrap(); + let mut writer = + ArrowWriter::try_new(file, to_write.schema(), Some(props.clone())).unwrap(); + + writer.write(&to_write).expect("Writing batch"); + + // writer must be closed to write footer + writer.close().unwrap(); + + (file_io, schema, table_location, tmp_dir) + }
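The two tests that follow pin down SQL's three-valued (Kleene) logic; for reference, a sketch (not part of the patch) with `None` standing in for NULL:

```rust
fn kleene_or(a: Option<bool>, b: Option<bool>) -> Option<bool> {
    match (a, b) {
        (Some(true), _) | (_, Some(true)) => Some(true),
        (Some(false), Some(false)) => Some(false),
        _ => None, // NULL OR false, NULL OR NULL, ...
    }
}

fn kleene_and(a: Option<bool>, b: Option<bool>) -> Option<bool> {
    match (a, b) {
        (Some(false), _) | (_, Some(false)) => Some(false),
        (Some(true), Some(true)) => Some(true),
        _ => None, // NULL AND true, NULL AND NULL, ...
    }
}

fn main() {
    // For the NULL row, `a IS NULL OR a = 'foo'` is OR(true, NULL) = true,
    // so the row is kept...
    assert_eq!(kleene_or(Some(true), None), Some(true));
    // ...while `a IS NOT NULL AND a != 'foo'` is AND(false, NULL) = false,
    // and only rows evaluating to true survive the filter.
    assert_eq!(kleene_and(Some(false), None), Some(false));
}
```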
+ + #[tokio::test] + async fn test_kleene_logic_or_behaviour() { + // a IS NULL OR a = 'foo' + let predicate = Reference::new("a") + .is_null() + .or(Reference::new("a").equal_to(Datum::string("foo"))); + + // Table data: [NULL, "foo", "bar"] + let data_for_col_a = vec![None, Some("foo".to_string()), Some("bar".to_string())]; + + // Expected: [NULL, "foo"]. + let expected = vec![None, Some("foo".to_string())]; + + let (file_io, schema, table_location, _temp_dir) = + setup_kleene_logic(data_for_col_a, DataType::Utf8); + let reader = ArrowReaderBuilder::new(file_io).build(); + + let result_data = test_perform_read(predicate, schema, table_location, reader).await; + + assert_eq!(result_data, expected); + } + + #[tokio::test] + async fn test_kleene_logic_and_behaviour() { + // a IS NOT NULL AND a != 'foo' + let predicate = Reference::new("a") + .is_not_null() + .and(Reference::new("a").not_equal_to(Datum::string("foo"))); + + // Table data: [NULL, "foo", "bar"] + let data_for_col_a = vec![None, Some("foo".to_string()), Some("bar".to_string())]; + + // Expected: ["bar"]. + let expected = vec![Some("bar".to_string())]; + + let (file_io, schema, table_location, _temp_dir) = + setup_kleene_logic(data_for_col_a, DataType::Utf8); + let reader = ArrowReaderBuilder::new(file_io).build(); + + let result_data = test_perform_read(predicate, schema, table_location, reader).await; + + assert_eq!(result_data, expected); + } + + #[tokio::test] + async fn test_predicate_cast_literal() { + let predicates = vec![ + // a == 'foo' + (Reference::new("a").equal_to(Datum::string("foo")), vec![ + Some("foo".to_string()), + ]), + // a != 'foo' + ( + Reference::new("a").not_equal_to(Datum::string("foo")), + vec![Some("bar".to_string())], + ), + // STARTS_WITH(a, 'f') + (Reference::new("a").starts_with(Datum::string("f")), vec![ + Some("foo".to_string()), + ]), + // NOT STARTS_WITH(a, 'f') + ( + Reference::new("a").not_starts_with(Datum::string("f")), + vec![Some("bar".to_string())], + ), + // a < 'foo' + (Reference::new("a").less_than(Datum::string("foo")), vec![ + Some("bar".to_string()), + ]), + // a <= 'foo' + ( + Reference::new("a").less_than_or_equal_to(Datum::string("foo")), + vec![Some("foo".to_string()), Some("bar".to_string())], + ), + // a > 'bar' + ( + Reference::new("a").greater_than(Datum::string("bar")), + vec![Some("foo".to_string())], + ), + // a >= 'foo' + ( + Reference::new("a").greater_than_or_equal_to(Datum::string("foo")), + vec![Some("foo".to_string())], + ), + // a IN ('foo', 'baz') + ( + Reference::new("a").is_in([Datum::string("foo"), Datum::string("baz")]), + vec![Some("foo".to_string())], + ), + // a NOT IN ('foo', 'baz') + ( + Reference::new("a").is_not_in([Datum::string("foo"), Datum::string("baz")]), + vec![Some("bar".to_string())], + ), + ]; + + // Table data: ["foo", "bar"] + let data_for_col_a = vec![Some("foo".to_string()), Some("bar".to_string())]; + + let (file_io, schema, table_location, _temp_dir) = + setup_kleene_logic(data_for_col_a, DataType::LargeUtf8); + let reader = ArrowReaderBuilder::new(file_io).build(); + + for (predicate, expected) in predicates { + println!("testing predicate {predicate}"); + let result_data = test_perform_read( + predicate.clone(), + schema.clone(), + table_location.clone(), + reader.clone(), + ) + .await; + + assert_eq!(result_data, expected, "predicate={predicate}"); + } + } + + /// Verifies that file splits respect byte ranges and only read specific row groups.
+ #[tokio::test] + async fn test_file_splits_respect_byte_ranges() { + use arrow_array::Int32Array; + use parquet::file::reader::{FileReader, SerializedFileReader}; + + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "1".to_string(), + )])), + ])); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_path = format!("{table_location}/multi_row_group.parquet"); + + // Force each batch into its own row group for testing byte range filtering. + let batch1 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new(Int32Array::from( + (0..100).collect::<Vec<_>>(), + ))]) + .unwrap(); + let batch2 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new(Int32Array::from( + (100..200).collect::<Vec<_>>(), + ))]) + .unwrap(); + let batch3 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new(Int32Array::from( + (200..300).collect::<Vec<_>>(), + ))]) + .unwrap(); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .set_max_row_group_row_count(Some(100)) + .build(); + + let file = File::create(&file_path).unwrap(); + let mut writer = ArrowWriter::try_new(file, arrow_schema.clone(), Some(props)).unwrap(); + writer.write(&batch1).expect("Writing batch 1"); + writer.write(&batch2).expect("Writing batch 2"); + writer.write(&batch3).expect("Writing batch 3"); + writer.close().unwrap(); + + // Read the file metadata to get row group byte positions + let file = File::open(&file_path).unwrap(); + let reader = SerializedFileReader::new(file).unwrap(); + let metadata = reader.metadata(); + + println!("File has {} row groups", metadata.num_row_groups()); + assert_eq!(metadata.num_row_groups(), 3, "Expected 3 row groups"); + + // Get byte positions for each row group + let row_group_0 = metadata.row_group(0); + let row_group_1 = metadata.row_group(1); + let row_group_2 = metadata.row_group(2); + + let rg0_start = 4u64; // Parquet files start with 4-byte magic "PAR1" + let rg1_start = rg0_start + row_group_0.compressed_size() as u64; + let rg2_start = rg1_start + row_group_1.compressed_size() as u64; + let file_end = rg2_start + row_group_2.compressed_size() as u64; + + println!( + "Row group 0: {} rows, starts at byte {}, {} bytes compressed", + row_group_0.num_rows(), + rg0_start, + row_group_0.compressed_size() + ); + println!( + "Row group 1: {} rows, starts at byte {}, {} bytes compressed", + row_group_1.num_rows(), + rg1_start, + row_group_1.compressed_size() + ); + println!( + "Row group 2: {} rows, starts at byte {}, {} bytes compressed", + row_group_2.num_rows(), + rg2_start, + row_group_2.compressed_size() + ); + + let file_io = FileIO::new_with_fs(); + let reader = ArrowReaderBuilder::new(file_io).build(); + + // Task 1: read only the first row group + let task1 = FileScanTask { + file_size_in_bytes: std::fs::metadata(&file_path).unwrap().len(), + start: rg0_start, + length: row_group_0.compressed_size() as u64, + record_count: Some(100), + data_file_path: file_path.clone(), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false,
+        };
+
+        // Task 2: read the second and third row groups
+        let task2 = FileScanTask {
+            file_size_in_bytes: std::fs::metadata(&file_path).unwrap().len(),
+            start: rg1_start,
+            length: file_end - rg1_start,
+            record_count: Some(200),
+            data_file_path: file_path.clone(),
+            data_file_format: DataFileFormat::Parquet,
+            schema: schema.clone(),
+            project_field_ids: vec![1],
+            predicate: None,
+            deletes: vec![],
+            partition: None,
+            partition_spec: None,
+            name_mapping: None,
+            case_sensitive: false,
+        };
+
+        let tasks1 = Box::pin(futures::stream::iter(vec![Ok(task1)])) as FileScanTaskStream;
+        let result1 = reader
+            .clone()
+            .read(tasks1)
+            .unwrap()
+            .try_collect::<Vec<_>>()
+            .await
+            .unwrap();
+
+        let total_rows_task1: usize = result1.iter().map(|b| b.num_rows()).sum();
+        println!(
+            "Task 1 (bytes {}-{}) returned {} rows",
+            rg0_start,
+            rg0_start + row_group_0.compressed_size() as u64,
+            total_rows_task1
+        );
+
+        let tasks2 = Box::pin(futures::stream::iter(vec![Ok(task2)])) as FileScanTaskStream;
+        let result2 = reader
+            .read(tasks2)
+            .unwrap()
+            .try_collect::<Vec<_>>()
+            .await
+            .unwrap();
+
+        let total_rows_task2: usize = result2.iter().map(|b| b.num_rows()).sum();
+        println!("Task 2 (bytes {rg1_start}-{file_end}) returned {total_rows_task2} rows");
+
+        assert_eq!(
+            total_rows_task1, 100,
+            "Task 1 should read only the first row group (100 rows), but got {total_rows_task1} rows"
+        );
+
+        assert_eq!(
+            total_rows_task2, 200,
+            "Task 2 should read only the second+third row groups (200 rows), but got {total_rows_task2} rows"
+        );
+
+        // Verify the actual data values are correct (not just the row count)
+        if total_rows_task1 > 0 {
+            let first_batch = &result1[0];
+            let id_col = first_batch
+                .column(0)
+                .as_primitive::<Int32Type>();
+            let first_val = id_col.value(0);
+            let last_val = id_col.value(id_col.len() - 1);
+            println!("Task 1 data range: {first_val} to {last_val}");
+
+            assert_eq!(first_val, 0, "Task 1 should start with id=0");
+            assert_eq!(last_val, 99, "Task 1 should end with id=99");
+        }
+
+        if total_rows_task2 > 0 {
+            let first_batch = &result2[0];
+            let id_col = first_batch
+                .column(0)
+                .as_primitive::<Int32Type>();
+            let first_val = id_col.value(0);
+            println!("Task 2 first value: {first_val}");
+
+            assert_eq!(first_val, 100, "Task 2 should start with id=100, not id=0");
+        }
+    }
+}
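A note on the contract this test pins down: a task owns exactly the row groups whose first byte falls inside `[start, start + length)`. The sketch below is hypothetical (not part of this patch); it assumes `(start, compressed_size)` pairs collected from parquet metadata the same way the test computes `rg0_start`/`rg1_start`/`rg2_start`:

```rust
use std::ops::Range;

/// Hypothetical helper: one byte-range split per row group.
/// `row_groups` holds (start_offset, compressed_size) pairs, with the first
/// offset at byte 4 (just past the "PAR1" magic).
fn row_group_splits(row_groups: &[(u64, u64)]) -> Vec<Range<u64>> {
    row_groups
        .iter()
        .map(|&(start, len)| start..start + len)
        .collect()
}
```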
diff --git a/crates/iceberg/src/arrow/schema.rs b/crates/iceberg/src/arrow/schema.rs
index bd9e249f48..9b504421ae 100644
--- a/crates/iceberg/src/arrow/schema.rs
+++ b/crates/iceberg/src/arrow/schema.rs
@@ -199,7 +199,10 @@ fn visit_struct<V: ArrowSchemaVisitor>(fields: &Fields, visitor: &mut V) -> Resu
 }
 
 /// Visit schema in post order.
-fn visit_schema<V: ArrowSchemaVisitor>(schema: &ArrowSchema, visitor: &mut V) -> Result<V::U> {
+pub(crate) fn visit_schema<V: ArrowSchemaVisitor>(
+    schema: &ArrowSchema,
+    visitor: &mut V,
+) -> Result<V::U> {
     let mut results = Vec::with_capacity(schema.fields().len());
     for field in schema.fields() {
         visitor.before_field(field)?;
@@ -759,6 +762,11 @@ pub(crate) fn get_arrow_datum(datum: &Datum) -> Result<Box<dyn ArrowDatum + Send>> {
+        (PrimitiveType::Fixed(_), PrimitiveLiteral::Binary(value)) => {
+            let array = FixedSizeBinaryArray::try_from_iter(std::iter::once(value.as_slice()))
+                .map_err(|e| Error::new(ErrorKind::DataInvalid, e.to_string()))?;
+            Ok(Arc::new(Scalar::new(array)))
+        }
         (primitive_type, _) => Err(Error::new(
             ErrorKind::FeatureUnsupported,
@@ -2151,6 +2159,18 @@ mod tests {
         assert!(is_scalar);
         assert_eq!(array.value(0), [66u8; 16]);
     }
+    {
+        let datum = Datum::fixed(vec![1u8, 2, 3, 4, 5, 6, 7, 8]);
+        let arrow_datum = get_arrow_datum(&datum).unwrap();
+        let (array, is_scalar) = arrow_datum.get();
+        let array = array
+            .as_any()
+            .downcast_ref::<FixedSizeBinaryArray>()
+            .unwrap();
+        assert!(is_scalar);
+        assert_eq!(array.value_length(), 8);
+        assert_eq!(array.value(0), &[1u8, 2, 3, 4, 5, 6, 7, 8]);
+    }
 }
 
 #[test]
diff --git a/crates/iceberg/src/catalog/metadata_location.rs b/crates/iceberg/src/catalog/metadata_location.rs
index ed28118879..acd041d5e1 100644
--- a/crates/iceberg/src/catalog/metadata_location.rs
+++ b/crates/iceberg/src/catalog/metadata_location.rs
@@ -114,9 +114,9 @@ impl MetadataLocation {
         ))?;
 
         // Check for compression suffix (e.g., .gz)
-        let gzip_suffix = CompressionCodec::Gzip.suffix()?;
+        let gzip_suffix = CompressionCodec::gzip_default().suffix()?;
         let (stripped, compression_codec) = if let Some(s) = stripped.strip_suffix(gzip_suffix) {
-            (s, CompressionCodec::Gzip)
+            (s, CompressionCodec::gzip_default())
         } else {
             (stripped, CompressionCodec::None)
         };
@@ -261,7 +261,7 @@ mod test {
                 table_location: "/abc".to_string(),
                 version: 1234567,
                 id: Uuid::from_str("2cd22b57-5127-4198-92ba-e4e67c79821b").unwrap(),
-                compression_codec: CompressionCodec::Gzip,
+                compression_codec: CompressionCodec::gzip_default(),
             }),
         ),
         // Negative version
@@ -345,10 +345,16 @@ mod test {
             "/test/table/metadata/00005-81056704-ce5b-41c4-bb83-eb6408081af6.gz.metadata.json",
         )
         .unwrap();
-        assert_eq!(location_gzip.compression_codec, CompressionCodec::Gzip);
+        assert_eq!(
+            location_gzip.compression_codec,
+            CompressionCodec::gzip_default()
+        );
 
         let next_gzip = location_gzip.with_next_version();
-        assert_eq!(next_gzip.compression_codec, CompressionCodec::Gzip);
+        assert_eq!(
+            next_gzip.compression_codec,
+            CompressionCodec::gzip_default()
+        );
         assert_eq!(next_gzip.version, 6);
     }
 
@@ -369,7 +375,10 @@ mod test {
         );
         let metadata_gzip = create_test_metadata(props_gzip);
         let updated_gzip = location.with_new_metadata(&metadata_gzip);
-        assert_eq!(updated_gzip.compression_codec, CompressionCodec::Gzip);
+        assert_eq!(
+            updated_gzip.compression_codec,
+            CompressionCodec::gzip_default()
+        );
         assert_eq!(updated_gzip.version, 0);
         assert_eq!(
             updated_gzip.to_string(),
diff --git a/crates/iceberg/src/catalog/mod.rs b/crates/iceberg/src/catalog/mod.rs
index f296cf2260..43102adec9 100644
--- a/crates/iceberg/src/catalog/mod.rs
+++ b/crates/iceberg/src/catalog/mod.rs
@@ -144,7 +144,6 @@ pub trait CatalogBuilder: Default + Debug + Send + Sync {
     ///
     /// let catalog = MyCatalogBuilder::default()
     ///     .with_storage_factory(Arc::new(OpenDalStorageFactory::S3 {
-    ///         configured_scheme: "s3a".to_string(),
     ///         customized_credential_load: None,
     ///     }))
     ///     .load("my_catalog", props)
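Background for the new `Fixed` arm in `get_arrow_datum`: arrow comparison kernels accept literals as a `Scalar` wrapping a one-row array, so a fixed-length value becomes a one-element `FixedSizeBinaryArray`. A minimal sketch using only public arrow-array APIs (the helper name is hypothetical, not part of this patch):

```rust
use arrow_array::{FixedSizeBinaryArray, Scalar};

/// Hypothetical helper: wrap one fixed-length byte value in the one-row
/// Scalar form that arrow comparison kernels take as a literal operand.
fn fixed_literal(bytes: &[u8]) -> Scalar<FixedSizeBinaryArray> {
    let array = FixedSizeBinaryArray::try_from_iter(std::iter::once(bytes))
        .expect("a single value is always a valid fixed-size binary array");
    Scalar::new(array)
}
```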
diff --git a/crates/iceberg/src/compression.rs b/crates/iceberg/src/compression.rs
index 42f5298437..929d9226e7 100644
--- a/crates/iceberg/src/compression.rs
+++ b/crates/iceberg/src/compression.rs
@@ -17,28 +17,101 @@
 
 //! Compression codec support for data compression and decompression.
 
+use std::fmt;
 use std::io::{Read, Write};
 
 use flate2::Compression;
 use flate2::read::GzDecoder;
 use flate2::write::GzEncoder;
-use serde::{Deserialize, Serialize};
+use serde::{Deserialize, Deserializer, Serialize, Serializer};
 
 use crate::{Error, ErrorKind, Result};
 
+/// Default compression level for Zstandard (zstd).
+const ZSTD_DEFAULT_LEVEL: u8 = 3;
+/// Default compression level for Gzip.
+const GZIP_DEFAULT_LEVEL: u8 = 6;
+/// Maximum compression level for Gzip.
+const GZIP_MAX_LEVEL: u8 = 9;
+
 /// Data compression formats
-#[derive(Debug, PartialEq, Eq, Clone, Copy, Default, Serialize, Deserialize)]
-#[serde(rename_all = "lowercase")]
+#[derive(Debug, PartialEq, Eq, Clone, Copy, Default)]
 pub enum CompressionCodec {
     #[default]
     /// No compression
     None,
     /// LZ4 single compression frame with content size present
     Lz4,
-    /// Zstandard single compression frame with content size present
-    Zstd,
-    /// Gzip compression
-    Gzip,
+    /// Zstandard single compression frame with content size present.
+    /// Level range is 0–22, where 0 means default compression level (not no compression).
+    /// Use [`CompressionCodec::zstd_default`] to construct with the default level.
+    Zstd(u8),
+    /// Gzip compression. Level range is 0–9, where 0 means no compression.
+    /// Use [`CompressionCodec::gzip_default`] to construct with the default level.
+    Gzip(u8),
+    /// Snappy compression
+    Snappy,
+}
+
+impl CompressionCodec {
+    /// Returns a Zstd codec with the default compression level.
+    pub const fn zstd_default() -> Self {
+        CompressionCodec::Zstd(ZSTD_DEFAULT_LEVEL)
+    }
+
+    /// Returns a Gzip codec with the default compression level.
+    pub const fn gzip_default() -> Self {
+        CompressionCodec::Gzip(GZIP_DEFAULT_LEVEL)
+    }
+
+    /// Returns the codec name as used in serialization and error messages.
+    pub fn name(&self) -> &'static str {
+        match self {
+            CompressionCodec::None => "none",
+            CompressionCodec::Lz4 => "lz4",
+            CompressionCodec::Zstd(_) => "zstd",
+            CompressionCodec::Gzip(_) => "gzip",
+            CompressionCodec::Snappy => "snappy",
+        }
+    }
+}
+
+// Note: serialize/deserialize do not round-trip the compression level. Iceberg configuration
+// stores only the codec name (e.g. "zstd"), not the level, so deserialization always produces
+// the default level. A `Zstd(5)` written to metadata will be read back as `Zstd(3)`. Some
+// compression configuration (e.g. Avro metadata) has a separate level field alongside the codec name.
+impl Serialize for CompressionCodec {
+    fn serialize<S: Serializer>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error> {
+        serializer.serialize_str(self.name())
+    }
+}
+
+impl<'de> Deserialize<'de> for CompressionCodec {
+    fn deserialize<D: Deserializer<'de>>(deserializer: D) -> std::result::Result<Self, D::Error> {
+        let s = String::deserialize(deserializer)?;
+        match s.to_lowercase().as_str() {
+            "none" => Ok(CompressionCodec::None),
+            "lz4" => Ok(CompressionCodec::Lz4),
+            "zstd" => Ok(CompressionCodec::zstd_default()),
+            "gzip" => Ok(CompressionCodec::gzip_default()),
+            "snappy" => Ok(CompressionCodec::Snappy),
+            other => Err(serde::de::Error::unknown_variant(other, &[
+                "none", "lz4", "zstd", "gzip", "snappy",
+            ])),
+        }
+    }
+}
+
+impl fmt::Display for CompressionCodec {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            CompressionCodec::None => write!(f, "None"),
+            CompressionCodec::Lz4 => write!(f, "Lz4"),
+            CompressionCodec::Zstd(level) => write!(f, "Zstd(level={level})"),
+            CompressionCodec::Gzip(level) => write!(f, "Gzip(level={level})"),
+            CompressionCodec::Snappy => write!(f, "Snappy"),
+        }
+    }
 }
 
 impl CompressionCodec {
@@ -49,13 +122,17 @@ impl CompressionCodec {
                 ErrorKind::FeatureUnsupported,
                 "LZ4 decompression is not supported currently",
             )),
-            CompressionCodec::Zstd => Ok(zstd::stream::decode_all(&bytes[..])?),
-            CompressionCodec::Gzip => {
+            CompressionCodec::Zstd(_) => Ok(zstd::stream::decode_all(&bytes[..])?),
+            CompressionCodec::Gzip(_) => {
                 let mut decoder = GzDecoder::new(&bytes[..]);
                 let mut decompressed = Vec::new();
                 decoder.read_to_end(&mut decompressed)?;
                 Ok(decompressed)
             }
+            CompressionCodec::Snappy => Err(Error::new(
+                ErrorKind::FeatureUnsupported,
+                "Snappy decompression is not supported currently",
+            )),
         }
     }
 
@@ -66,19 +143,24 @@ impl CompressionCodec {
                 ErrorKind::FeatureUnsupported,
                 "LZ4 compression is not supported currently",
             )),
-            CompressionCodec::Zstd => {
+            CompressionCodec::Zstd(level) => {
                 let writer = Vec::<u8>::new();
-                let mut encoder = zstd::stream::Encoder::new(writer, 3)?;
+                let mut encoder = zstd::stream::Encoder::new(writer, *level as i32)?;
                 encoder.include_checksum(true)?;
                 encoder.set_pledged_src_size(Some(bytes.len().try_into()?))?;
                 std::io::copy(&mut &bytes[..], &mut encoder)?;
                 Ok(encoder.finish()?)
             }
-            CompressionCodec::Gzip => {
-                let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
+            CompressionCodec::Gzip(level) => {
+                let compression = Compression::new((*level).min(GZIP_MAX_LEVEL) as u32);
+                let mut encoder = GzEncoder::new(Vec::new(), compression);
                 encoder.write_all(&bytes)?;
                 Ok(encoder.finish()?)
             }
+            CompressionCodec::Snappy => Err(Error::new(
+                ErrorKind::FeatureUnsupported,
+                "Snappy compression is not supported currently",
+            )),
         }
     }
 
@@ -95,8 +177,10 @@ impl CompressionCodec {
     pub fn suffix(&self) -> Result<&'static str> {
         match self {
             CompressionCodec::None => Ok(""),
-            CompressionCodec::Gzip => Ok(".gz"),
-            codec @ (CompressionCodec::Lz4 | CompressionCodec::Zstd) => Err(Error::new(
+            CompressionCodec::Gzip(_) => Ok(".gz"),
+            codec @ (CompressionCodec::Lz4
+            | CompressionCodec::Zstd(_)
+            | CompressionCodec::Snappy) => Err(Error::new(
                 ErrorKind::FeatureUnsupported,
                 format!("suffix not defined for {codec:?}"),
             )),
@@ -123,7 +207,10 @@ mod tests {
     async fn test_compression_codec_compress() {
         let bytes_vec = [0_u8; 100].to_vec();
 
-        let compression_codecs = [CompressionCodec::Zstd, CompressionCodec::Gzip];
+        let compression_codecs = [
+            CompressionCodec::zstd_default(),
+            CompressionCodec::gzip_default(),
+        ];
 
         for codec in compression_codecs {
             let compressed = codec.compress(bytes_vec.clone()).unwrap();
@@ -135,7 +222,10 @@ mod tests {
 
     #[tokio::test]
     async fn test_compression_codec_unsupported() {
-        let unsupported_codecs = [(CompressionCodec::Lz4, "LZ4")];
+        let unsupported_codecs = [
+            (CompressionCodec::Lz4, "LZ4"),
+            (CompressionCodec::Snappy, "Snappy"),
+        ];
         let bytes_vec = [0_u8; 100].to_vec();
 
         for (codec, name) in unsupported_codecs {
@@ -153,18 +243,34 @@ mod tests {
 
     #[test]
     fn test_suffix() {
-        // Test supported codecs
         assert_eq!(CompressionCodec::None.suffix().unwrap(), "");
-        assert_eq!(CompressionCodec::Gzip.suffix().unwrap(), ".gz");
+        assert_eq!(CompressionCodec::gzip_default().suffix().unwrap(), ".gz");
 
-        // Test unsupported codecs return errors
         assert!(CompressionCodec::Lz4.suffix().is_err());
-        assert!(CompressionCodec::Zstd.suffix().is_err());
+        assert!(CompressionCodec::zstd_default().suffix().is_err());
+        assert!(CompressionCodec::Snappy.suffix().is_err());
 
         let lz4_err = CompressionCodec::Lz4.suffix().unwrap_err();
         assert!(lz4_err.to_string().contains("suffix not defined for Lz4"));
 
-        let zstd_err = CompressionCodec::Zstd.suffix().unwrap_err();
+        let zstd_err = CompressionCodec::zstd_default().suffix().unwrap_err();
         assert!(zstd_err.to_string().contains("suffix not defined for Zstd"));
     }
+
+    #[test]
+    fn test_display() {
+        assert_eq!(CompressionCodec::None.to_string(), "None");
+        assert_eq!(CompressionCodec::Lz4.to_string(), "Lz4");
+        assert_eq!(
+            CompressionCodec::zstd_default().to_string(),
+            "Zstd(level=3)"
+        );
+        assert_eq!(CompressionCodec::Zstd(5).to_string(), "Zstd(level=5)");
+        assert_eq!(
+            CompressionCodec::gzip_default().to_string(),
+            "Gzip(level=6)"
+        );
+        assert_eq!(CompressionCodec::Gzip(9).to_string(), "Gzip(level=9)");
+        assert_eq!(CompressionCodec::Snappy.to_string(), "Snappy");
+    }
 }
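The level-lossiness documented in the note above can be pinned down in a few lines. A sketch, assuming `serde_json` is available as the serializer (the test itself is illustrative, not part of this patch):

```rust
#[test]
fn compression_level_is_not_round_tripped() {
    // Serialization emits only the codec name...
    let json = serde_json::to_string(&CompressionCodec::Zstd(5)).unwrap();
    assert_eq!(json, "\"zstd\"");

    // ...so deserialization restores the default level, not the original one.
    let back: CompressionCodec = serde_json::from_str(&json).unwrap();
    assert_eq!(back, CompressionCodec::zstd_default()); // Zstd(3), not Zstd(5)
}
```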
diff --git a/crates/iceberg/src/encryption/file_decryptor.rs b/crates/iceberg/src/encryption/file_decryptor.rs
new file mode 100644
index 0000000000..e44c0e1d78
--- /dev/null
+++ b/crates/iceberg/src/encryption/file_decryptor.rs
@@ -0,0 +1,156 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! File-level decryption helper for AGS1 stream-encrypted files.
+
+use std::fmt;
+use std::sync::Arc;
+
+use super::crypto::{AesGcmCipher, SecureKey};
+use super::stream::AesGcmFileRead;
+use crate::Result;
+use crate::io::FileRead;
+
+/// Holds the decryption material for a single encrypted file.
+///
+/// Created from a plaintext DEK and AAD prefix, then used to wrap
+/// an encrypted file reader for transparent decryption on read.
+pub struct AesGcmFileDecryptor {
+    cipher: Arc<AesGcmCipher>,
+    aad_prefix: Box<[u8]>,
+}
+
+impl AesGcmFileDecryptor {
+    /// Creates a new `AesGcmFileDecryptor` from a plaintext DEK and AAD prefix.
+    pub fn new(dek: &[u8], aad_prefix: impl Into<Box<[u8]>>) -> Result<Self> {
+        let key = SecureKey::new(dek)?;
+        let cipher = Arc::new(AesGcmCipher::new(key));
+        Ok(Self {
+            cipher,
+            aad_prefix: aad_prefix.into(),
+        })
+    }
+
+    /// Wraps a raw encrypted-file reader in a decrypting [`AesGcmFileRead`].
+    pub fn wrap_reader(
+        &self,
+        reader: Box<dyn FileRead>,
+        encrypted_file_length: u64,
+    ) -> Result<Box<dyn FileRead>> {
+        let decrypting = AesGcmFileRead::new(
+            reader,
+            Arc::clone(&self.cipher),
+            self.aad_prefix.clone(),
+            encrypted_file_length,
+        )?;
+        Ok(Box::new(decrypting))
+    }
+
+    /// Calculates the plaintext length from an encrypted file's total length.
+    pub fn plaintext_length(&self, encrypted_file_length: u64) -> Result<u64> {
+        AesGcmFileRead::calculate_plaintext_length(encrypted_file_length)
+    }
+}
+
+impl fmt::Debug for AesGcmFileDecryptor {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("AesGcmFileDecryptor")
+            .field("aad_prefix_len", &self.aad_prefix.len())
+            .finish_non_exhaustive()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::ops::Range;
+
+    use bytes::Bytes;
+
+    use super::*;
+    use crate::encryption::AesGcmFileEncryptor;
+    use crate::io::FileWrite;
+
+    struct MemoryFileRead(Bytes);
+
+    #[async_trait::async_trait]
+    impl FileRead for MemoryFileRead {
+        async fn read(&self, range: Range<u64>) -> Result<Bytes> {
+            Ok(self.0.slice(range.start as usize..range.end as usize))
+        }
+    }
+
+    struct MemoryFileWrite {
+        buffer: std::sync::Arc<std::sync::Mutex<Vec<u8>>>,
+    }
+
+    #[async_trait::async_trait]
+    impl FileWrite for MemoryFileWrite {
+        async fn write(&mut self, bs: Bytes) -> Result<()> {
+            self.buffer.lock().unwrap().extend_from_slice(&bs);
+            Ok(())
+        }
+
+        async fn close(&mut self) -> Result<()> {
+            Ok(())
+        }
+    }
+
+    #[tokio::test]
+    async fn test_wrap_reader_roundtrip() {
+        let key = b"0123456789abcdef";
+        let aad_prefix = b"test-aad-prefix!";
+        let plaintext = b"Hello from file decryptor!";
+
+        // Encrypt via the encryptor wrapper
+        let encryptor = AesGcmFileEncryptor::new(key.as_slice(), aad_prefix.as_slice()).unwrap();
+        let buffer = std::sync::Arc::new(std::sync::Mutex::new(Vec::new()));
+        let mut writer = encryptor.wrap_writer(Box::new(MemoryFileWrite {
+            buffer: buffer.clone(),
+        }));
+        writer.write(Bytes::from(plaintext.to_vec())).await.unwrap();
+        writer.close().await.unwrap();
+        let encrypted = buffer.lock().unwrap().clone();
+        let encrypted_len = encrypted.len() as u64;
+
+        // Decrypt via the decryptor wrapper
+        let decryptor = AesGcmFileDecryptor::new(key.as_slice(), aad_prefix.as_slice()).unwrap();
+        let reader = decryptor
+            .wrap_reader(
+                Box::new(MemoryFileRead(Bytes::from(encrypted))),
+                encrypted_len,
+            )
+            .unwrap();
+
+        let result = reader.read(0..plaintext.len() as u64).await.unwrap();
+        assert_eq!(&result[..], plaintext);
+    }
+
+    #[tokio::test]
+    async fn test_invalid_key_length() {
+        let result = AesGcmFileDecryptor::new(b"too-short", b"aad".as_slice());
+        assert!(result.is_err());
+    }
+
+    #[tokio::test]
+    async fn test_plaintext_length() {
+        let decryptor = AesGcmFileDecryptor::new(b"0123456789abcdef", b"aad".as_slice()).unwrap();
+        // header(8) + nonce(12) + 10 bytes ciphertext + tag(16) = 46
+        let encrypted_len = 8 + 12 + 10 + 16;
+        let plain_len = decryptor.plaintext_length(encrypted_len).unwrap();
+        assert_eq!(plain_len, 10);
+    }
+}
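Taken together, the decryptor's three methods support a simple whole-file read. A sketch under the same crate-internal types (`FileRead` and `Result` from this crate, `Bytes` from the `bytes` crate); the function is hypothetical and assumes an async caller:

```rust
/// Hypothetical helper: read an entire AGS1-encrypted file back as plaintext.
async fn read_all_decrypted(
    decryptor: &AesGcmFileDecryptor,
    encrypted: Box<dyn FileRead>,
    encrypted_len: u64,
) -> Result<Bytes> {
    let plain_len = decryptor.plaintext_length(encrypted_len)?;
    let reader = decryptor.wrap_reader(encrypted, encrypted_len)?;
    reader.read(0..plain_len).await
}
```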
diff --git a/crates/iceberg/src/encryption/file_encryptor.rs b/crates/iceberg/src/encryption/file_encryptor.rs
new file mode 100644
index 0000000000..773438ad80
--- /dev/null
+++ b/crates/iceberg/src/encryption/file_encryptor.rs
@@ -0,0 +1,138 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! File-level encryption helper for AGS1 stream-encrypted files.
+
+use std::fmt;
+use std::sync::Arc;
+
+use super::crypto::{AesGcmCipher, SecureKey};
+use super::stream::AesGcmFileWrite;
+use crate::Result;
+use crate::io::FileWrite;
+
+/// Holds the encryption material for a single encrypted file.
+///
+/// This is the write-side counterpart to
+/// [`AesGcmFileDecryptor`](super::AesGcmFileDecryptor). Created from
+/// a plaintext DEK and AAD prefix, then used to wrap an output writer
+/// for transparent encryption on write.
+pub struct AesGcmFileEncryptor {
+    cipher: Arc<AesGcmCipher>,
+    aad_prefix: Box<[u8]>,
+}
+
+impl AesGcmFileEncryptor {
+    /// Creates a new `AesGcmFileEncryptor` from a plaintext DEK and AAD prefix.
+    pub fn new(dek: &[u8], aad_prefix: impl Into<Box<[u8]>>) -> Result<Self> {
+        let key = SecureKey::new(dek)?;
+        let cipher = Arc::new(AesGcmCipher::new(key));
+        Ok(Self {
+            cipher,
+            aad_prefix: aad_prefix.into(),
+        })
+    }
+
+    /// Wraps a raw output writer in an encrypting [`AesGcmFileWrite`].
+    pub fn wrap_writer(&self, writer: Box<dyn FileWrite>) -> Box<dyn FileWrite> {
+        Box::new(AesGcmFileWrite::new(
+            writer,
+            Arc::clone(&self.cipher),
+            self.aad_prefix.clone(),
+        ))
+    }
+}
+
+impl fmt::Debug for AesGcmFileEncryptor {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("AesGcmFileEncryptor")
+            .field("aad_prefix_len", &self.aad_prefix.len())
+            .finish_non_exhaustive()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::ops::Range;
+
+    use bytes::Bytes;
+
+    use super::*;
+    use crate::encryption::AesGcmFileDecryptor;
+    use crate::io::FileRead;
+
+    struct MemoryFileRead(Bytes);
+
+    #[async_trait::async_trait]
+    impl FileRead for MemoryFileRead {
+        async fn read(&self, range: Range<u64>) -> Result<Bytes> {
+            Ok(self.0.slice(range.start as usize..range.end as usize))
+        }
+    }
+
+    struct MemoryFileWrite {
+        buffer: std::sync::Arc<std::sync::Mutex<Vec<u8>>>,
+    }
+
+    #[async_trait::async_trait]
+    impl FileWrite for MemoryFileWrite {
+        async fn write(&mut self, bs: Bytes) -> Result<()> {
+            self.buffer.lock().unwrap().extend_from_slice(&bs);
+            Ok(())
+        }
+
+        async fn close(&mut self) -> Result<()> {
+            Ok(())
+        }
+    }
+
+    #[tokio::test]
+    async fn test_wrap_writer_roundtrip() {
+        let key = b"0123456789abcdef";
+        let aad_prefix = b"test-aad-prefix!";
+        let plaintext = b"Hello from file encryptor!";
+
+        // Encrypt via the encryptor wrapper
+        let encryptor = AesGcmFileEncryptor::new(key.as_slice(), aad_prefix.as_slice()).unwrap();
+        let buffer = std::sync::Arc::new(std::sync::Mutex::new(Vec::new()));
+        let mut writer = encryptor.wrap_writer(Box::new(MemoryFileWrite {
+            buffer: buffer.clone(),
+        }));
+        writer.write(Bytes::from(plaintext.to_vec())).await.unwrap();
+        writer.close().await.unwrap();
+        let encrypted = buffer.lock().unwrap().clone();
+        let encrypted_len = encrypted.len() as u64;
+
+        // Decrypt via the decryptor wrapper
+        let decryptor = AesGcmFileDecryptor::new(key.as_slice(), aad_prefix.as_slice()).unwrap();
+        let reader = decryptor
+            .wrap_reader(
+                Box::new(MemoryFileRead(Bytes::from(encrypted))),
+                encrypted_len,
+            )
+            .unwrap();
+
+        let result = reader.read(0..plaintext.len() as u64).await.unwrap();
+        assert_eq!(&result[..], plaintext);
+    }
+
+    #[tokio::test]
+    async fn test_invalid_key_length() {
+        let result = AesGcmFileEncryptor::new(b"bad-key", b"aad".as_slice());
+        assert!(result.is_err());
+    }
+}
diff --git a/crates/iceberg/src/encryption/mod.rs b/crates/iceberg/src/encryption/mod.rs
index 097f4f24e3..9888a153c7 100644
--- a/crates/iceberg/src/encryption/mod.rs
+++ b/crates/iceberg/src/encryption/mod.rs
@@ -21,5 +21,11 @@
 //! and decrypting data in Iceberg tables.
 
 mod crypto;
+mod file_decryptor;
+mod file_encryptor;
+mod stream;
 
 pub use crypto::{AesGcmCipher, AesKeySize, SecureKey};
+pub use file_decryptor::AesGcmFileDecryptor;
+pub use file_encryptor::AesGcmFileEncryptor;
+pub use stream::{AesGcmFileRead, AesGcmFileWrite};
diff --git a/crates/iceberg/src/encryption/stream.rs b/crates/iceberg/src/encryption/stream.rs
new file mode 100644
index 0000000000..130578f2b1
--- /dev/null
+++ b/crates/iceberg/src/encryption/stream.rs
@@ -0,0 +1,1249 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! AGS1 stream encryption/decryption for Iceberg.
+//!
+//! Implements the block-based AES-GCM stream format used by Iceberg for
+//! encrypting manifest lists and manifest files. The format is
+//! byte-compatible with Java's `AesGcmInputStream` / `AesGcmOutputStream`.
+//!
+//! # AGS1 File Format
+//!
+//! ```text
+//! ┌─────────────────────────────────────────────┐
+//! │ Header (8 bytes)                            │
+//! │   Magic: "AGS1" (4 bytes, ASCII)            │
+//! │   Plain block size: u32 LE (4 bytes)        │
+//! │   Default: 1,048,576 (1 MiB)                │
+//! ├─────────────────────────────────────────────┤
+//! │ Block 0                                     │
+//! │   Nonce (12 bytes)                          │
+//! │   Ciphertext (up to plain_block_size bytes) │
+//! │   GCM Tag (16 bytes)                        │
+//! ├─────────────────────────────────────────────┤
+//! │ Block 1..N (same structure)                 │
+//! ├─────────────────────────────────────────────┤
+//! │ Final block (may be shorter)                │
+//! └─────────────────────────────────────────────┘
+//! ```
+//!
+//! Each block's AAD is: `aad_prefix || block_index (4 bytes, LE)`.
+
+use std::ops::Range;
+use std::sync::Arc;
+
+use bytes::{Bytes, BytesMut};
+
+use super::AesGcmCipher;
+use crate::io::{FileRead, FileWrite};
+use crate::{Error, ErrorKind, Result};
+
+/// Default plaintext block size (1 MiB), matching Java's `Ciphers.PLAIN_BLOCK_SIZE`.
+pub const PLAIN_BLOCK_SIZE: u32 = 1024 * 1024;
+
+/// AES-GCM nonce length in bytes.
+pub const NONCE_LENGTH: u32 = 12;
+
+/// AES-GCM authentication tag length in bytes.
+pub const GCM_TAG_LENGTH: u32 = 16;
+
+/// Cipher block size = plaintext block size + nonce + GCM tag.
+pub const CIPHER_BLOCK_SIZE: u32 = PLAIN_BLOCK_SIZE + NONCE_LENGTH + GCM_TAG_LENGTH;
+
+/// AGS1 stream magic bytes.
+pub const GCM_STREAM_MAGIC: [u8; 4] = *b"AGS1";
+
+/// AGS1 stream header length (4-byte magic + 4-byte block size).
+pub const GCM_STREAM_HEADER_LENGTH: u32 = 8;
+
+/// Minimum valid AGS1 stream length (header + one empty block).
+#[cfg(test)]
+pub const MIN_STREAM_LENGTH: u32 = GCM_STREAM_HEADER_LENGTH + NONCE_LENGTH + GCM_TAG_LENGTH;
+
+/// Constructs the per-block AAD for AGS1 stream encryption.
+///
+/// Format: `aad_prefix || block_index (4 bytes, little-endian)`
+///
+/// This matches Java's `Ciphers.streamBlockAAD()`.
+pub(crate) fn stream_block_aad(aad_prefix: &[u8], block_index: u32) -> Vec<u8> {
+    let index_bytes = block_index.to_le_bytes();
+    if aad_prefix.is_empty() {
+        index_bytes.to_vec()
+    } else {
+        let mut aad = Vec::with_capacity(aad_prefix.len() + 4);
+        aad.extend_from_slice(aad_prefix);
+        aad.extend_from_slice(&index_bytes);
+        aad
+    }
+}
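One consequence of the layout above: the encrypted size is a pure function of the plaintext size (8-byte header, plus 28 bytes of nonce/tag overhead per block, with at least one block even for an empty file). A sketch of that arithmetic, assuming the constants defined above (the helper itself is hypothetical, not part of this patch):

```rust
/// Hypothetical helper: encrypted AGS1 file size for a given plaintext size.
fn ags1_encrypted_len(plaintext_len: u64) -> u64 {
    const BLOCK: u64 = PLAIN_BLOCK_SIZE as u64;
    const OVERHEAD: u64 = (NONCE_LENGTH + GCM_TAG_LENGTH) as u64; // 28 bytes per block
    // Even an empty file carries one (empty) block.
    let blocks = if plaintext_len == 0 {
        1
    } else {
        plaintext_len.div_ceil(BLOCK)
    };
    GCM_STREAM_HEADER_LENGTH as u64 + plaintext_len + blocks * OVERHEAD
}

// ags1_encrypted_len(0) == 36, which is MIN_STREAM_LENGTH;
// ags1_encrypted_len(PLAIN_BLOCK_SIZE as u64) == 8 + 1_048_576 + 28.
```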
+/// Transparent decryption of AGS1 stream-encrypted files.
+///
+/// Implements the [`FileRead`] trait, providing random-access reads over
+/// encrypted data. Each `read()` call determines which encrypted blocks
+/// overlap the requested plaintext range, reads and decrypts them, then
+/// returns the requested plaintext bytes.
+///
+/// # Usage
+///
+/// ```ignore
+/// // (ignored: requires async runtime and concrete FileRead/FileWrite impls)
+/// let reader = AesGcmFileRead::new(
+///     inner_reader,           // Box<dyn FileRead> for the encrypted file
+///     cipher,                 // Arc<AesGcmCipher> with the DEK
+///     aad_prefix.to_vec(),
+///     encrypted_file_length,
+/// )?;
+///
+/// // Read plaintext bytes transparently
+/// let plaintext = reader.read(0..1024).await?;
+/// ```
+pub struct AesGcmFileRead {
+    /// The underlying encrypted file reader.
+    inner: Box<dyn FileRead>,
+    /// The AES-GCM cipher holding the DEK.
+    cipher: Arc<AesGcmCipher>,
+    /// AAD prefix from the key metadata.
+    aad_prefix: Box<[u8]>,
+    /// Total plaintext stream size in bytes.
+    plain_stream_size: u64,
+    /// Total number of encrypted blocks.
+    num_blocks: u64,
+    /// Size of the last cipher block (may be smaller than `CIPHER_BLOCK_SIZE`).
+    last_cipher_block_size: u32,
+}
+
+impl AesGcmFileRead {
+    /// Creates a new `AesGcmFileRead` for decrypting an AGS1 stream.
+    ///
+    /// Computes the plaintext size and block layout from the encrypted file
+    /// length. No I/O is performed; header validation happens implicitly
+    /// when blocks are decrypted (GCM authentication will fail on corrupt data).
+    ///
+    /// # Arguments
+    ///
+    /// * `inner` - Reader for the underlying encrypted file
+    /// * `cipher` - AES-GCM cipher initialized with the file's DEK
+    /// * `aad_prefix` - AAD prefix from the file's `StandardKeyMetadata`
+    /// * `encrypted_file_length` - Total byte length of the encrypted file
+    pub fn new(
+        inner: Box<dyn FileRead>,
+        cipher: Arc<AesGcmCipher>,
+        aad_prefix: Box<[u8]>,
+        encrypted_file_length: u64,
+    ) -> Result<Self> {
+        let plain_stream_size = Self::calculate_plaintext_length(encrypted_file_length)?;
+        let stream_length = encrypted_file_length - GCM_STREAM_HEADER_LENGTH as u64;
+
+        if stream_length == 0 {
+            return Ok(Self {
+                inner,
+                cipher,
+                aad_prefix,
+                plain_stream_size: 0,
+                num_blocks: 0,
+                last_cipher_block_size: 0,
+            });
+        }
+
+        let num_full_blocks = stream_length / CIPHER_BLOCK_SIZE as u64;
+        let cipher_bytes_in_last_block = (stream_length % CIPHER_BLOCK_SIZE as u64) as u32;
+        let full_blocks_only = cipher_bytes_in_last_block == 0;
+
+        let num_blocks = if full_blocks_only {
+            num_full_blocks
+        } else {
+            num_full_blocks + 1
+        };
+
+        if num_blocks > u32::MAX as u64 {
+            return Err(Error::new(
+                ErrorKind::DataInvalid,
+                format!(
+                    "AGS1 format supports at most {} blocks (~4 TiB per file), but file requires {num_blocks} blocks",
+                    u32::MAX
+                ),
+            ));
+        }
+
+        let last_cipher_block_size = if full_blocks_only {
+            CIPHER_BLOCK_SIZE
+        } else {
+            cipher_bytes_in_last_block
+        };
+
+        Ok(Self {
+            inner,
+            cipher,
+            aad_prefix,
+            plain_stream_size,
+            num_blocks,
+            last_cipher_block_size,
+        })
+    }
+
+    /// Returns the plaintext stream size in bytes.
+    pub fn plaintext_length(&self) -> u64 {
+        self.plain_stream_size
+    }
+
+    /// Calculates the plaintext length from an encrypted file's total length.
+    ///
+    /// This is a static calculation matching Java's
+    /// `AesGcmInputStream.calculatePlaintextLength()`.
+    pub fn calculate_plaintext_length(encrypted_file_length: u64) -> Result<u64> {
+        if encrypted_file_length < GCM_STREAM_HEADER_LENGTH as u64 {
+            return Err(Error::new(
+                ErrorKind::DataInvalid,
+                format!(
+                    "Encrypted file too short: {encrypted_file_length} bytes (minimum {GCM_STREAM_HEADER_LENGTH})"
+                ),
+            ));
+        }
+
+        let stream_length = encrypted_file_length - GCM_STREAM_HEADER_LENGTH as u64;
+
+        if stream_length == 0 {
+            return Ok(0);
+        }
+
+        let num_full_blocks = stream_length / CIPHER_BLOCK_SIZE as u64;
+        let cipher_bytes_in_last_block = stream_length % CIPHER_BLOCK_SIZE as u64;
+        let full_blocks_only = cipher_bytes_in_last_block == 0;
+
+        let plain_bytes_in_last_block = if full_blocks_only {
+            0
+        } else {
+            if cipher_bytes_in_last_block < (NONCE_LENGTH + GCM_TAG_LENGTH) as u64 {
+                return Err(Error::new(
+                    ErrorKind::DataInvalid,
+                    format!(
+                        "Truncated encrypted file: last block is {} bytes (minimum {})",
+                        cipher_bytes_in_last_block,
+                        NONCE_LENGTH + GCM_TAG_LENGTH
+                    ),
+                ));
+            }
+            cipher_bytes_in_last_block - NONCE_LENGTH as u64 - GCM_TAG_LENGTH as u64
+        };
+
+        Ok(num_full_blocks * PLAIN_BLOCK_SIZE as u64 + plain_bytes_in_last_block)
+    }
+
+    /// Returns the encrypted byte offset for a given block index.
+    fn encrypted_block_offset(block_index: u64) -> u64 {
+        block_index * CIPHER_BLOCK_SIZE as u64 + GCM_STREAM_HEADER_LENGTH as u64
+    }
+
+    /// Returns the cipher block size for a given block index.
+    fn cipher_block_size(&self, block_index: u64) -> u32 {
+        if block_index == self.num_blocks - 1 {
+            self.last_cipher_block_size
+        } else {
+            CIPHER_BLOCK_SIZE
+        }
+    }
+}
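Before the `FileRead` implementation below, it may help to isolate the first step its `read()` performs: mapping plaintext offsets to block indices by integer division, with an inclusive last block. A sketch, assuming the module's `PLAIN_BLOCK_SIZE` (the helper is hypothetical, not part of this patch):

```rust
use std::ops::{Range, RangeInclusive};

/// Hypothetical helper: the encrypted blocks overlapping a plaintext range.
/// Mirrors the first_block/last_block computation in `read()`.
fn blocks_for_range(range: &Range<u64>) -> RangeInclusive<u64> {
    debug_assert!(range.start < range.end, "empty ranges are handled earlier");
    let first = range.start / PLAIN_BLOCK_SIZE as u64;
    let last = (range.end - 1) / PLAIN_BLOCK_SIZE as u64;
    first..=last
}
```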
+#[async_trait::async_trait]
+impl FileRead for AesGcmFileRead {
+    /// Reads and decrypts a plaintext byte range from the encrypted AGS1 stream.
+    ///
+    /// The caller specifies a range in **plaintext** coordinates (e.g. "bytes 0..1024
+    /// of the original file"). This method translates that into the encrypted file
+    /// layout and performs the following steps:
+    ///
+    /// 1. **Map to blocks** — divides the plaintext range by `PLAIN_BLOCK_SIZE` to
+    ///    find which encrypted blocks (`first_block..=last_block`) contain the
+    ///    requested data.
+    ///
+    /// 2. **Single I/O read** — calculates the contiguous byte range in the
+    ///    encrypted file that covers all needed blocks (including the 8-byte AGS1
+    ///    header offset, 12-byte nonces, and 16-byte GCM tags) and fetches them in
+    ///    one call to the inner `FileRead`.
+    ///
+    /// 3. **Decrypt per block** — iterates over each cipher block in the response,
+    ///    decrypts it with AES-GCM using the per-block AAD (`aad_prefix || block_index`),
+    ///    and slices out only the plaintext bytes that overlap the requested range.
+    ///
+    /// 4. **Assemble result** — concatenates the slices into a single `Bytes` buffer
+    ///    matching exactly `range.end - range.start` bytes.
+    ///
+    /// Because each block is independently encrypted with its own nonce and AAD,
+    /// arbitrary random-access reads are supported without decrypting the entire
+    /// file. GCM authentication is verified per-block, so any tampering is detected
+    /// at the granularity of individual blocks.
+    async fn read(&self, range: Range<u64>) -> Result<Bytes> {
+        if range.start == range.end {
+            return Ok(Bytes::new());
+        }
+
+        if range.start > range.end {
+            return Err(Error::new(
+                ErrorKind::DataInvalid,
+                format!(
+                    "Invalid read range: start ({}) is greater than end ({})",
+                    range.start, range.end
+                ),
+            ));
+        }
+
+        if range.end > self.plain_stream_size {
+            return Err(Error::new(
+                ErrorKind::DataInvalid,
+                format!(
+                    "Read range {}..{} exceeds plaintext size {}",
+                    range.start, range.end, self.plain_stream_size
+                ),
+            ));
+        }
+
+        if self.num_blocks == 0 {
+            return Ok(Bytes::new());
+        }
+
+        let first_block = range.start / PLAIN_BLOCK_SIZE as u64;
+        let last_block = (range.end - 1) / PLAIN_BLOCK_SIZE as u64;
+
+        // Read all needed encrypted blocks in a single I/O call
+        let encrypted_start = Self::encrypted_block_offset(first_block);
+        let encrypted_end =
+            Self::encrypted_block_offset(last_block) + self.cipher_block_size(last_block) as u64;
+
+        let all_encrypted = self.inner.read(encrypted_start..encrypted_end).await?;
+
+        // Decrypt each block and extract the requested plaintext range
+        let result_len = (range.end - range.start) as usize;
+        let mut result = BytesMut::with_capacity(result_len);
+        let mut encrypted_offset = 0usize;
+
+        for block_idx in first_block..=last_block {
+            let block_size = self.cipher_block_size(block_idx) as usize;
+            let cipher_block = &all_encrypted[encrypted_offset..encrypted_offset + block_size];
+            encrypted_offset += block_size;
+
+            let aad = stream_block_aad(&self.aad_prefix, block_idx as u32);
+            let decrypted = self.cipher.decrypt(cipher_block, Some(&aad))?;
+
+            // Calculate which slice of this decrypted block we need
+            let block_plain_start = block_idx * PLAIN_BLOCK_SIZE as u64;
+            let slice_start = if block_idx == first_block {
+                (range.start - block_plain_start) as usize
+            } else {
+                0
+            };
+            let slice_end = if block_idx == last_block {
+                (range.end - block_plain_start) as usize
+            } else {
+                decrypted.len()
+            };
+
+            result.extend_from_slice(&decrypted[slice_start..slice_end]);
+        }
+
+        Ok(result.freeze())
+    }
+}
+
+/// Transparent encryption of AGS1 stream-encrypted files.
+///
+/// Implements the [`FileWrite`] trait, buffering plaintext and emitting
+/// encrypted AGS1 blocks. This is the streaming write counterpart to
+/// [`AesGcmFileRead`].
+///
+/// # Usage
+///
+/// ```ignore
+/// // (ignored: requires async runtime and concrete FileRead/FileWrite impls)
+/// let writer = AesGcmFileWrite::new(
+///     inner_writer,        // Box<dyn FileWrite> for the output file
+///     cipher,              // Arc<AesGcmCipher> with the DEK
+///     aad_prefix.to_vec(),
+/// );
+///
+/// writer.write(plaintext_chunk).await?;
+/// writer.close().await?;
+/// ```
+pub struct AesGcmFileWrite {
+    /// The underlying output writer.
+    inner: Box<dyn FileWrite>,
+    /// The AES-GCM cipher holding the DEK.
+    cipher: Arc<AesGcmCipher>,
+    /// AAD prefix from the key metadata.
+    aad_prefix: Box<[u8]>,
+    /// Plaintext buffer accumulating data before block encryption.
+    buffer: Vec<u8>,
+    /// Current block index for AAD construction.
+    block_index: u32,
+    /// Whether the AGS1 header has been written.
+    header_written: bool,
+    /// Whether close() has been called.
+    closed: bool,
+    /// Whether the writer is in a poisoned state due to a failed inner write.
+    /// Once poisoned, all subsequent operations are rejected because the inner
+    /// writer may have received partial data.
+    poisoned: bool,
+}
+
+impl AesGcmFileWrite {
+    /// Creates a new `AesGcmFileWrite` for encrypting to AGS1 format.
+    ///
+    /// No I/O is performed until `write()` or `close()` is called.
+    pub fn new(
+        inner: Box<dyn FileWrite>,
+        cipher: Arc<AesGcmCipher>,
+        aad_prefix: impl Into<Box<[u8]>>,
+    ) -> Self {
+        Self {
+            inner,
+            cipher,
+            aad_prefix: aad_prefix.into(),
+            buffer: Vec::new(),
+            block_index: 0,
+            header_written: false,
+            closed: false,
+            poisoned: false,
+        }
+    }
+
+    /// Writes the AGS1 header (magic + plain block size) to the inner writer.
+    async fn write_header(&mut self) -> Result<()> {
+        let mut header = Vec::with_capacity(GCM_STREAM_HEADER_LENGTH as usize);
+        header.extend_from_slice(&GCM_STREAM_MAGIC);
+        header.extend_from_slice(&PLAIN_BLOCK_SIZE.to_le_bytes());
+        if let Err(e) = self.inner.write(Bytes::from(header)).await {
+            self.poisoned = true;
+            return Err(e);
+        }
+        self.header_written = true;
+        Ok(())
+    }
+
+    /// Encrypts a plaintext block and writes it to the inner writer.
+    async fn encrypt_and_write_block(&mut self, block_data: &[u8]) -> Result<()> {
+        let aad = stream_block_aad(&self.aad_prefix, self.block_index);
+        let encrypted = self.cipher.encrypt(block_data, Some(&aad))?;
+        if let Err(e) = self.inner.write(Bytes::from(encrypted)).await {
+            self.poisoned = true;
+            return Err(e);
+        }
+        self.block_index = self.block_index.checked_add(1).ok_or_else(|| {
+            Error::new(
+                ErrorKind::DataInvalid,
+                "AGS1 block index overflow: file exceeds the maximum supported size (~4 TiB)",
+            )
+        })?;
+        Ok(())
+    }
+
+    /// Encrypts the first `PLAIN_BLOCK_SIZE` bytes of the buffer in-place
+    /// and drains them, avoiding a 1 MiB temporary copy.
+    async fn encrypt_and_drain_block(&mut self) -> Result<()> {
+        let aad = stream_block_aad(&self.aad_prefix, self.block_index);
+        let encrypted = self
+            .cipher
+            .encrypt(&self.buffer[..PLAIN_BLOCK_SIZE as usize], Some(&aad))?;
+        if let Err(e) = self.inner.write(Bytes::from(encrypted)).await {
+            self.poisoned = true;
+            return Err(e);
+        }
+        self.block_index = self.block_index.checked_add(1).ok_or_else(|| {
+            Error::new(
+                ErrorKind::DataInvalid,
+                "AGS1 block index overflow: file exceeds the maximum supported size (~4 TiB)",
+            )
+        })?;
+        self.buffer.drain(..PLAIN_BLOCK_SIZE as usize);
+        Ok(())
+    }
+}
+
+#[async_trait::async_trait]
+impl FileWrite for AesGcmFileWrite {
+    async fn write(&mut self, bs: Bytes) -> Result<()> {
+        if self.closed {
+            return Err(Error::new(
+                ErrorKind::Unexpected,
+                "Cannot write to a closed AesGcmFileWrite",
+            ));
+        }
+        if self.poisoned {
+            return Err(Error::new(
+                ErrorKind::Unexpected,
+                "AesGcmFileWrite is in a poisoned state due to a previous write failure",
+            ));
+        }
+
+        if !self.header_written {
+            self.write_header().await?;
+        }
+
+        self.buffer.extend_from_slice(&bs);
+
+        // Flush full blocks
+        while self.buffer.len() >= PLAIN_BLOCK_SIZE as usize {
+            self.encrypt_and_drain_block().await?;
+        }
+
+        Ok(())
+    }
+
+    async fn close(&mut self) -> Result<()> {
+        if self.closed {
+            return Err(Error::new(
+                ErrorKind::Unexpected,
+                "AesGcmFileWrite already closed",
+            ));
+        }
+        if self.poisoned {
+            return Err(Error::new(
+                ErrorKind::Unexpected,
+                "AesGcmFileWrite is in a poisoned state due to a previous write failure",
+            ));
+        }
+
+        if !self.header_written {
+            self.write_header().await?;
+        }
+
+        // Write the final block if there's remaining data, or if this is an empty file
+        // (block_index == 0). Skip writing a spurious empty block when the plaintext was
+        // exactly block-aligned (buffer empty, blocks already written).
+        if !self.buffer.is_empty() || self.block_index == 0 {
+            let final_block = std::mem::take(&mut self.buffer);
+            self.encrypt_and_write_block(&final_block).await?;
+        }
+        self.closed = true;
+
+        self.inner.close().await
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Encrypts plaintext into AGS1 format for testing.
+    ///
+    /// Mirrors Java's `AesGcmOutputStream` behavior:
+    /// - Always writes header + at least one block (even for empty input)
+    /// - Full blocks are `PLAIN_BLOCK_SIZE` bytes; last block may be shorter
+    fn encrypt_ags1(plaintext: &[u8], cipher: &AesGcmCipher, aad_prefix: &[u8]) -> Vec<u8> {
+        let mut result = Vec::new();
+
+        // Write header: "AGS1" + PLAIN_BLOCK_SIZE (LE)
+        result.extend_from_slice(&GCM_STREAM_MAGIC);
+        result.extend_from_slice(&PLAIN_BLOCK_SIZE.to_le_bytes());
+
+        // Write blocks
+        let mut offset = 0;
+        let mut block_index = 0u32;
+
+        loop {
+            let remaining = plaintext.len() - offset;
+            let block_size = std::cmp::min(remaining, PLAIN_BLOCK_SIZE as usize);
+
+            // Block 0 is always written (even if empty); subsequent empty blocks are skipped
+            if block_size == 0 && block_index > 0 {
+                break;
+            }
+
+            let block_data = &plaintext[offset..offset + block_size];
+            let aad = stream_block_aad(aad_prefix, block_index);
+            let encrypted = cipher.encrypt(block_data, Some(&aad)).unwrap();
+            result.extend_from_slice(&encrypted);
+
+            offset += block_size;
+            block_index += 1;
+
+            // A partial block is always the last
+            if block_size < PLAIN_BLOCK_SIZE as usize {
+                break;
+            }
+        }
+
+        result
+    }
+
+    /// Helper to create an AesGcmCipher from raw key bytes.
+    fn make_cipher(key: &[u8]) -> AesGcmCipher {
+        use super::super::SecureKey;
+        let secure_key = SecureKey::new(key).unwrap();
+        AesGcmCipher::new(secure_key)
+    }
+
+    /// Helper to create an in-memory FileRead from bytes.
+    fn memory_reader(data: Vec<u8>) -> Box<dyn FileRead> {
+        Box::new(MemoryFileRead(Bytes::from(data)))
+    }
+
+    /// Simple in-memory FileRead for tests.
+    struct MemoryFileRead(Bytes);
+
+    #[async_trait::async_trait]
+    impl FileRead for MemoryFileRead {
+        async fn read(&self, range: Range<u64>) -> Result<Bytes> {
+            let start = range.start as usize;
+            let end = range.end as usize;
+            if end > self.0.len() {
+                return Err(Error::new(
+                    ErrorKind::DataInvalid,
+                    format!(
+                        "Range {}..{} out of bounds for {} bytes",
+                        start,
+                        end,
+                        self.0.len()
+                    ),
+                ));
+            }
+            Ok(self.0.slice(start..end))
+        }
+    }
+
+    #[tokio::test]
+    async fn test_empty_file_roundtrip() {
+        let key = b"0123456789abcdef";
+        let aad_prefix = b"test-aad-prefix!";
+        let cipher = make_cipher(key);
+
+        let encrypted = encrypt_ags1(b"", &cipher, aad_prefix);
+
+        // Verify minimum length: header(8) + nonce(12) + tag(16) = 36
+        assert_eq!(encrypted.len(), MIN_STREAM_LENGTH as usize);
+
+        let reader = AesGcmFileRead::new(
+            memory_reader(encrypted.clone()),
+            Arc::new(make_cipher(key)),
+            aad_prefix.as_slice().into(),
+            encrypted.len() as u64,
+        )
+        .unwrap();
+
+        assert_eq!(reader.plaintext_length(), 0);
+
+        // Reading empty range should return empty bytes
+        let result = reader.read(0..0).await.unwrap();
+        assert!(result.is_empty());
+    }
+
+    #[tokio::test]
+    async fn test_small_file_roundtrip() {
+        let key = b"0123456789abcdef";
+        let aad_prefix = b"test-aad-prefix!";
+        let plaintext = b"Hello, Iceberg encryption!";
+        let cipher = make_cipher(key);
+
+        let encrypted = encrypt_ags1(plaintext, &cipher, aad_prefix);
+
+        let reader = AesGcmFileRead::new(
+            memory_reader(encrypted.clone()),
+            Arc::new(make_cipher(key)),
+            aad_prefix.as_slice().into(),
+            encrypted.len() as u64,
+        )
+        .unwrap();
+
+        assert_eq!(reader.plaintext_length(), plaintext.len() as u64);
+
+        // Read entire file
+        let result = reader.read(0..plaintext.len() as u64).await.unwrap();
+        assert_eq!(&result[..], plaintext);
+    }
+
+    #[tokio::test]
+    async fn test_partial_read() {
+        let key = b"0123456789abcdef";
+        let aad_prefix = b"aad-prefix-here!";
+        let plaintext = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+        let cipher = make_cipher(key);
+
+        let encrypted = encrypt_ags1(plaintext, &cipher, aad_prefix);
+
+        let reader = AesGcmFileRead::new(
+            memory_reader(encrypted.clone()),
+            Arc::new(make_cipher(key)),
+            aad_prefix.as_slice().into(),
+            encrypted.len() as u64,
+        )
+        .unwrap();
+
+        // Read a slice from the middle
+        let result = reader.read(10..20).await.unwrap();
+        assert_eq!(&result[..], &plaintext[10..20]);
+
+        // Read first byte
+        let result = reader.read(0..1).await.unwrap();
+        assert_eq!(&result[..], &plaintext[0..1]);
+
+        // Read last byte
+        let last = plaintext.len() as u64;
+        let result = reader.read(last - 1..last).await.unwrap();
+        assert_eq!(&result[..], &plaintext[plaintext.len() - 1..]);
+    }
+
+    #[tokio::test]
+    async fn test_multi_block_roundtrip() {
+        let key = b"0123456789abcdef";
+        let aad_prefix = b"multi-block-aad!";
+
+        // 1.5 blocks of data
+        let size = PLAIN_BLOCK_SIZE as usize + PLAIN_BLOCK_SIZE as usize / 2;
+        let plaintext: Vec<u8> = (0..size).map(|i| (i % 256) as u8).collect();
+        let cipher = make_cipher(key);
+
+        let encrypted = encrypt_ags1(&plaintext, &cipher, aad_prefix);
+
+        let reader = AesGcmFileRead::new(
+            memory_reader(encrypted.clone()),
+            Arc::new(make_cipher(key)),
+            aad_prefix.as_slice().into(),
+            encrypted.len() as u64,
+        )
+        .unwrap();
+
+        assert_eq!(reader.plaintext_length(), plaintext.len() as u64);
+
+        // Read entire file
+        let result = reader.read(0..plaintext.len() as u64).await.unwrap();
+        assert_eq!(&result[..], &plaintext[..]);
+    }
+
+    #[tokio::test]
+    async fn test_cross_block_read() {
b"0123456789abcdef"; + let aad_prefix = b"cross-block-aad!"; + + // 2.5 blocks of data + let size = PLAIN_BLOCK_SIZE as usize * 2 + PLAIN_BLOCK_SIZE as usize / 2; + let plaintext: Vec = (0..size).map(|i| (i % 256) as u8).collect(); + let cipher = make_cipher(key); + + let encrypted = encrypt_ags1(&plaintext, &cipher, aad_prefix); + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + // Read across block boundary (last 100 bytes of block 0 + first 100 bytes of block 1) + let boundary = PLAIN_BLOCK_SIZE as u64; + let result = reader.read(boundary - 100..boundary + 100).await.unwrap(); + assert_eq!( + &result[..], + &plaintext[(boundary - 100) as usize..(boundary + 100) as usize] + ); + + // Read across two block boundaries (spans blocks 0, 1, and 2) + let result = reader.read(boundary - 50..boundary * 2 + 50).await.unwrap(); + assert_eq!( + &result[..], + &plaintext[(boundary - 50) as usize..(boundary * 2 + 50) as usize] + ); + } + + #[tokio::test] + async fn test_exact_block_size() { + let key = b"0123456789abcdef"; + let aad_prefix = b"exact-block-aad!"; + + // Exactly 1 block + let plaintext: Vec = (0..PLAIN_BLOCK_SIZE as usize) + .map(|i| (i % 256) as u8) + .collect(); + let cipher = make_cipher(key); + + let encrypted = encrypt_ags1(&plaintext, &cipher, aad_prefix); + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), PLAIN_BLOCK_SIZE as u64); + + let result = reader.read(0..PLAIN_BLOCK_SIZE as u64).await.unwrap(); + assert_eq!(&result[..], &plaintext[..]); + } + + #[tokio::test] + async fn test_block_size_plus_one() { + let key = b"0123456789abcdef"; + let aad_prefix = b"block-plus-one!!"; + + // 1 block + 1 byte + let size = PLAIN_BLOCK_SIZE as usize + 1; + let plaintext: Vec = (0..size).map(|i| (i % 256) as u8).collect(); + let cipher = make_cipher(key); + + let encrypted = encrypt_ags1(&plaintext, &cipher, aad_prefix); + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), size as u64); + + // Read the last byte (in block 1) + let result = reader.read(size as u64 - 1..size as u64).await.unwrap(); + assert_eq!(result[0], plaintext[size - 1]); + + // Read all + let result = reader.read(0..size as u64).await.unwrap(); + assert_eq!(&result[..], &plaintext[..]); + } + + #[tokio::test] + async fn test_block_size_minus_one() { + let key = b"0123456789abcdef"; + let aad_prefix = b"block-minus-one!"; + + // 1 block - 1 byte + let size = PLAIN_BLOCK_SIZE as usize - 1; + let plaintext: Vec = (0..size).map(|i| (i % 256) as u8).collect(); + let cipher = make_cipher(key); + + let encrypted = encrypt_ags1(&plaintext, &cipher, aad_prefix); + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), size as u64); + + let result = reader.read(0..size as u64).await.unwrap(); + assert_eq!(&result[..], &plaintext[..]); + } + + #[tokio::test] + async fn test_wrong_aad_fails() { + let key = b"0123456789abcdef"; + let aad_prefix = b"correct-aad-here"; + let plaintext = b"sensitive data here"; + 
+        let cipher = make_cipher(key);
+
+        let encrypted = encrypt_ags1(plaintext, &cipher, aad_prefix);
+
+        // Try to decrypt with wrong AAD
+        let mut bad_aad = aad_prefix.to_vec();
+        bad_aad[0] ^= 0xFF;
+
+        let reader = AesGcmFileRead::new(
+            memory_reader(encrypted.clone()),
+            Arc::new(make_cipher(key)),
+            bad_aad.as_slice().into(),
+            encrypted.len() as u64,
+        )
+        .unwrap();
+
+        let result = reader.read(0..plaintext.len() as u64).await;
+        assert!(result.is_err(), "Decryption with wrong AAD should fail");
+    }
+
+    #[tokio::test]
+    async fn test_wrong_key_fails() {
+        let key = b"0123456789abcdef";
+        let wrong_key = b"fedcba9876543210";
+        let aad_prefix = b"test-aad-prefix!";
+        let plaintext = b"sensitive data";
+        let cipher = make_cipher(key);
+
+        let encrypted = encrypt_ags1(plaintext, &cipher, aad_prefix);
+
+        let reader = AesGcmFileRead::new(
+            memory_reader(encrypted.clone()),
+            Arc::new(make_cipher(wrong_key)),
+            aad_prefix.as_slice().into(),
+            encrypted.len() as u64,
+        )
+        .unwrap();
+
+        let result = reader.read(0..plaintext.len() as u64).await;
+        assert!(result.is_err(), "Decryption with wrong key should fail");
+    }
+
+    #[tokio::test]
+    async fn test_out_of_bounds_read() {
+        let key = b"0123456789abcdef";
+        let aad_prefix = b"test-aad-prefix!";
+        let plaintext = b"short data";
+        let cipher = make_cipher(key);
+
+        let encrypted = encrypt_ags1(plaintext, &cipher, aad_prefix);
+
+        let reader = AesGcmFileRead::new(
+            memory_reader(encrypted.clone()),
+            Arc::new(make_cipher(key)),
+            aad_prefix.as_slice().into(),
+            encrypted.len() as u64,
+        )
+        .unwrap();
+
+        let result = reader.read(0..plaintext.len() as u64 + 1).await;
+        assert!(result.is_err(), "Reading past end should fail");
+    }
+
+    #[tokio::test]
+    async fn test_calculate_plaintext_length() {
+        // Empty file: header only (not valid per Java, but handled)
+        assert_eq!(
+            AesGcmFileRead::calculate_plaintext_length(GCM_STREAM_HEADER_LENGTH as u64).unwrap(),
+            0
+        );
+
+        // Empty file with one empty block: header(8) + nonce(12) + tag(16) = 36
+        assert_eq!(
+            AesGcmFileRead::calculate_plaintext_length(MIN_STREAM_LENGTH as u64).unwrap(),
+            0
+        );
+
+        // One full block: header(8) + cipher_block(1048604) = 1048612
+        let one_full = GCM_STREAM_HEADER_LENGTH as u64 + CIPHER_BLOCK_SIZE as u64;
+        assert_eq!(
+            AesGcmFileRead::calculate_plaintext_length(one_full).unwrap(),
+            PLAIN_BLOCK_SIZE as u64
+        );
+
+        // One full block + 1 byte: need partial second block
+        // Second block = nonce(12) + 1 byte ciphertext + tag(16) = 29
+        let one_full_plus_one = one_full + NONCE_LENGTH as u64 + 1 + GCM_TAG_LENGTH as u64;
+        assert_eq!(
+            AesGcmFileRead::calculate_plaintext_length(one_full_plus_one).unwrap(),
+            PLAIN_BLOCK_SIZE as u64 + 1
+        );
+    }
+
+    #[tokio::test]
+    async fn test_stream_block_aad() {
+        // With prefix
+        let aad = stream_block_aad(b"prefix", 0);
+        assert_eq!(&aad[..6], b"prefix");
+        assert_eq!(&aad[6..], &0u32.to_le_bytes());
+
+        let aad = stream_block_aad(b"prefix", 1);
+        assert_eq!(&aad[..6], b"prefix");
+        assert_eq!(&aad[6..], &1u32.to_le_bytes());
+
+        // Without prefix
+        let aad = stream_block_aad(b"", 42);
+        assert_eq!(&aad[..], &42u32.to_le_bytes());
+    }
+
+    #[tokio::test]
+    async fn test_encrypted_file_too_short() {
+        let result = AesGcmFileRead::new(
+            memory_reader(vec![0; 4]),
+            Arc::new(make_cipher(b"0123456789abcdef")),
+            [].into(),
+            4,
+        );
+        assert!(result.is_err());
+    }
+
+    // --- AesGcmFileWrite tests ---
+
+    /// Shared-buffer FileWrite for testing AesGcmFileWrite output.
+    struct SharedMemoryWrite {
+        buffer: std::sync::Arc<std::sync::Mutex<Vec<u8>>>,
+    }
+
+    /// FileWrite that fails after a configured number of successful writes.
+    struct FailingFileWrite {
+        writes_before_failure: usize,
+        write_count: usize,
+    }
+
+    #[async_trait::async_trait]
+    impl FileWrite for FailingFileWrite {
+        async fn write(&mut self, _bs: Bytes) -> Result<()> {
+            if self.write_count >= self.writes_before_failure {
+                return Err(Error::new(ErrorKind::Unexpected, "simulated write failure"));
+            }
+            self.write_count += 1;
+            Ok(())
+        }
+
+        async fn close(&mut self) -> Result<()> {
+            Ok(())
+        }
+    }
+
+    #[async_trait::async_trait]
+    impl FileWrite for SharedMemoryWrite {
+        async fn write(&mut self, bs: Bytes) -> Result<()> {
+            self.buffer.lock().unwrap().extend_from_slice(&bs);
+            Ok(())
+        }
+
+        async fn close(&mut self) -> Result<()> {
+            Ok(())
+        }
+    }
+
+    /// Helper: one-shot encrypt through AesGcmFileWrite, return encrypted bytes.
+    async fn write_through_ags1(plaintext: &[u8], key: &[u8], aad_prefix: &[u8]) -> Vec<u8> {
+        let buffer = std::sync::Arc::new(std::sync::Mutex::new(Vec::new()));
+        let inner: Box<dyn FileWrite> = Box::new(SharedMemoryWrite {
+            buffer: buffer.clone(),
+        });
+        let cipher = Arc::new(make_cipher(key));
+        let mut writer = AesGcmFileWrite::new(inner, cipher, aad_prefix.to_vec());
+
+        writer.write(Bytes::from(plaintext.to_vec())).await.unwrap();
+        writer.close().await.unwrap();
+
+        buffer.lock().unwrap().clone()
+    }
+
+    #[tokio::test]
+    async fn test_write_empty_roundtrip() {
+        let key = b"0123456789abcdef";
+        let aad_prefix = b"test-aad-prefix!";
+
+        let encrypted = write_through_ags1(b"", key, aad_prefix).await;
+
+        // Should produce header + one empty encrypted block
+        assert_eq!(encrypted.len(), MIN_STREAM_LENGTH as usize);
+
+        let reader = AesGcmFileRead::new(
+            memory_reader(encrypted.clone()),
+            Arc::new(make_cipher(key)),
+            aad_prefix.as_slice().into(),
+            encrypted.len() as u64,
+        )
+        .unwrap();
+
+        assert_eq!(reader.plaintext_length(), 0);
+    }
+
+    #[tokio::test]
+    async fn test_write_small_roundtrip() {
+        let key = b"0123456789abcdef";
+        let aad_prefix = b"test-aad-prefix!";
+        let plaintext = b"Hello, Iceberg encryption!";
+
+        let encrypted = write_through_ags1(plaintext, key, aad_prefix).await;
+
+        let reader = AesGcmFileRead::new(
+            memory_reader(encrypted.clone()),
+            Arc::new(make_cipher(key)),
+            aad_prefix.as_slice().into(),
+            encrypted.len() as u64,
+        )
+        .unwrap();
+
+        assert_eq!(reader.plaintext_length(), plaintext.len() as u64);
+        let result = reader.read(0..plaintext.len() as u64).await.unwrap();
+        assert_eq!(&result[..], plaintext);
+    }
+
+    #[tokio::test]
+    async fn test_write_multi_block_roundtrip() {
+        let key = b"0123456789abcdef";
+        let aad_prefix = b"multi-block-aad!";
+
+        // 1.5 blocks of data
+        let size = PLAIN_BLOCK_SIZE as usize + PLAIN_BLOCK_SIZE as usize / 2;
+        let plaintext: Vec<u8> = (0..size).map(|i| (i % 256) as u8).collect();
+
+        let encrypted = write_through_ags1(&plaintext, key, aad_prefix).await;
+
+        let reader = AesGcmFileRead::new(
+            memory_reader(encrypted.clone()),
+            Arc::new(make_cipher(key)),
+            aad_prefix.as_slice().into(),
+            encrypted.len() as u64,
+        )
+        .unwrap();
+
+        assert_eq!(reader.plaintext_length(), plaintext.len() as u64);
+        let result = reader.read(0..plaintext.len() as u64).await.unwrap();
+        assert_eq!(&result[..], &plaintext[..]);
+    }
+
+    #[tokio::test]
+    async fn test_write_cross_block_accumulation() {
+        let key = b"0123456789abcdef";
+        let aad_prefix = b"cross-block-aad!";
+
+        let buffer = std::sync::Arc::new(std::sync::Mutex::new(Vec::new()));
inner: Box<dyn FileWrite> = Box::new(SharedMemoryWrite { + buffer: buffer.clone(), + }); + let cipher = Arc::new(make_cipher(key)); + let mut writer = AesGcmFileWrite::new(inner, cipher, aad_prefix.to_vec()); + + // Write 1.5 blocks in 1000-byte chunks + let total_size = PLAIN_BLOCK_SIZE as usize + PLAIN_BLOCK_SIZE as usize / 2; + let plaintext: Vec<u8> = (0..total_size).map(|i| (i % 256) as u8).collect(); + let chunk_size = 1000; + for chunk in plaintext.chunks(chunk_size) { + writer.write(Bytes::from(chunk.to_vec())).await.unwrap(); + } + writer.close().await.unwrap(); + + let encrypted = buffer.lock().unwrap().clone(); + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), plaintext.len() as u64); + let result = reader.read(0..plaintext.len() as u64).await.unwrap(); + assert_eq!(&result[..], &plaintext[..]); + } + + #[tokio::test] + async fn test_write_exact_block_size() { + let key = b"0123456789abcdef"; + let aad_prefix = b"exact-block-aad!"; + + // Exactly 1 block + let plaintext: Vec<u8> = (0..PLAIN_BLOCK_SIZE as usize) + .map(|i| (i % 256) as u8) + .collect(); + + let encrypted = write_through_ags1(&plaintext, key, aad_prefix).await; + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), PLAIN_BLOCK_SIZE as u64); + let result = reader.read(0..PLAIN_BLOCK_SIZE as u64).await.unwrap(); + assert_eq!(&result[..], &plaintext[..]); + } + + #[tokio::test] + async fn test_write_block_aligned_no_spurious_empty_block() { + let key = b"0123456789abcdef"; + let aad_prefix = b"block-align-aad!"; + + // Write exactly one block of plaintext — close() should NOT add + // a trailing empty encrypted block (28 bytes: 12-byte nonce + 16-byte tag). 
+ let plaintext: Vec<u8> = (0..PLAIN_BLOCK_SIZE as usize) + .map(|i| (i % 256) as u8) + .collect(); + + let encrypted_via_writer = write_through_ags1(&plaintext, key, aad_prefix).await; + let encrypted_via_reference = encrypt_ags1(&plaintext, &make_cipher(key), aad_prefix); + + // Both should be the same length — no extra 28-byte empty block + assert_eq!( + encrypted_via_writer.len(), + encrypted_via_reference.len(), + "Writer output should match reference encryption length (no spurious trailing block)" + ); + + // Verify roundtrip + let reader = AesGcmFileRead::new( + memory_reader(encrypted_via_writer.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted_via_writer.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), PLAIN_BLOCK_SIZE as u64); + let result = reader.read(0..PLAIN_BLOCK_SIZE as u64).await.unwrap(); + assert_eq!(&result[..], &plaintext[..]); + } + + #[tokio::test] + async fn test_write_two_blocks_aligned_no_spurious_empty_block() { + let key = b"0123456789abcdef"; + let aad_prefix = b"2blk-align-aad!!"; + + // Exactly 2 blocks + let size = PLAIN_BLOCK_SIZE as usize * 2; + let plaintext: Vec<u8> = (0..size).map(|i| (i % 256) as u8).collect(); + + let encrypted_via_writer = write_through_ags1(&plaintext, key, aad_prefix).await; + let encrypted_via_reference = encrypt_ags1(&plaintext, &make_cipher(key), aad_prefix); + + assert_eq!( + encrypted_via_writer.len(), + encrypted_via_reference.len(), + "Writer output should match reference encryption length (no spurious trailing block)" + ); + + let reader = AesGcmFileRead::new( + memory_reader(encrypted_via_writer.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted_via_writer.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), size as u64); + let result = reader.read(0..size as u64).await.unwrap(); + assert_eq!(&result[..], &plaintext[..]); + } + + #[tokio::test] + async fn test_write_poisoned_after_inner_write_failure() { + let cipher = Arc::new(make_cipher(b"0123456789abcdef")); + // Fail on the second write (first write is the header, second is block data) + let inner: Box<dyn FileWrite> = Box::new(FailingFileWrite { + writes_before_failure: 1, + write_count: 0, + }); + let mut writer = AesGcmFileWrite::new(inner, cipher, b"aad-prefix-here!".to_vec()); + + // First write triggers header (succeeds) + block encrypt+write (fails) + let data = vec![0u8; PLAIN_BLOCK_SIZE as usize]; + let result = writer.write(Bytes::from(data)).await; + assert!(result.is_err()); + + // Subsequent write should be rejected as poisoned + let result = writer.write(Bytes::from(b"more data".to_vec())).await; + assert!(result.is_err()); + assert!( + result.unwrap_err().to_string().contains("poisoned"), + "expected poisoned error" + ); + + // Close should also be rejected + let result = writer.close().await; + assert!(result.is_err()); + assert!( + result.unwrap_err().to_string().contains("poisoned"), + "expected poisoned error on close" + ); + } +}
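The magic numbers in `test_calculate_plaintext_length` all fall out of the AGS1 layout the test comments describe: an 8-byte stream header, one encrypted block per 1 MiB of plaintext (each carrying a 12-byte nonce and a 16-byte GCM tag), and an empty file still written as one empty block. A minimal sketch of that arithmetic, with the constant values taken from the comments above (the crate's own constant names may differ):

```rust
// Illustrative values only, mirroring the test comments above.
const HEADER: u64 = 8; // GCM_STREAM_HEADER_LENGTH
const NONCE: u64 = 12; // NONCE_LENGTH
const TAG: u64 = 16; // GCM_TAG_LENGTH
const PLAIN_BLOCK: u64 = 1024 * 1024; // PLAIN_BLOCK_SIZE

/// Expected AGS1 stream length for a given plaintext length.
fn stream_len(plain_len: u64) -> u64 {
    // An empty file still carries one empty encrypted block, and
    // block-aligned lengths get no trailing empty block.
    let blocks = plain_len.div_ceil(PLAIN_BLOCK).max(1);
    HEADER + plain_len + blocks * (NONCE + TAG)
}

fn main() {
    assert_eq!(stream_len(0), 36); // MIN_STREAM_LENGTH: header(8) + nonce(12) + tag(16)
    assert_eq!(stream_len(PLAIN_BLOCK), 1_048_612); // header(8) + cipher_block(1048604)
    assert_eq!(stream_len(PLAIN_BLOCK + 1), 1_048_612 + 29); // + nonce(12) + 1 + tag(16)
}
```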
diff --git a/crates/iceberg/src/expr/visitors/page_index_evaluator.rs b/crates/iceberg/src/expr/visitors/page_index_evaluator.rs index 96d1c651cd..4cd676dab1 100644 --- a/crates/iceberg/src/expr/visitors/page_index_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/page_index_evaluator.rs @@ -793,7 +793,7 @@ mod tests { }; use parquet::file::metadata::{PageIndexPolicy, ParquetMetaData}; use parquet::file::properties::WriterProperties; - use rand::{Rng, thread_rng}; + use rand::Rng; use tempfile::NamedTempFile; use super::PageIndexEvaluator; @@ -1284,13 +1284,13 @@ mod tests { #[test] fn eval_in_length_of_set_above_limit_all_rows() -> Result<()> { - let mut rng = thread_rng(); + let mut rng = rand::rng(); let (metadata, _temp_file) = create_test_parquet_file()?; let (column_index, offset_index, row_group_metadata) = get_test_metadata(&metadata); let (iceberg_schema_ref, field_id_map) = build_iceberg_schema_and_field_map()?; let filter = Reference::new("col_float") - .is_in(std::iter::repeat_with(|| Datum::float(rng.gen_range(0.0..10.0))).take(1000)) + .is_in(std::iter::repeat_with(|| Datum::float(rng.random_range(0.0..10.0))).take(1000)) .bind(iceberg_schema_ref.clone(), false)?; let result = PageIndexEvaluator::eval( diff --git a/crates/iceberg/src/expr/visitors/row_group_metrics_evaluator.rs b/crates/iceberg/src/expr/visitors/row_group_metrics_evaluator.rs index 0506b33af0..ad7e19f548 100644 --- a/crates/iceberg/src/expr/visitors/row_group_metrics_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/row_group_metrics_evaluator.rs @@ -528,7 +528,7 @@ mod tests { use parquet::schema::types::{ ColumnDescriptor, ColumnPath, SchemaDescriptor, Type as parquetSchemaType, }; - use rand::{Rng, thread_rng}; + use rand::Rng; use super::RowGroupMetricsEvaluator; use crate::Result; @@ -1617,7 +1617,7 @@ mod tests { #[test] fn eval_true_for_too_many_literals_filter_is_in() -> Result<()> { - let mut rng = thread_rng(); + let mut rng = rand::rng(); let row_group_metadata = create_row_group_metadata( 1, @@ -1636,7 +1636,7 @@ let (iceberg_schema_ref, field_id_map) = build_iceberg_schema_and_field_map()?; let filter = Reference::new("col_float") - .is_in(std::iter::repeat_with(|| Datum::float(rng.gen_range(0.0..10.0))).take(1000)) + .is_in(std::iter::repeat_with(|| Datum::float(rng.random_range(0.0..10.0))).take(1000)) .bind(iceberg_schema_ref.clone(), false)?; let result = RowGroupMetricsEvaluator::eval( diff --git a/crates/iceberg/src/io/storage/config/s3.rs b/crates/iceberg/src/io/storage/config/s3.rs index fae3a14757..64db47084e 100644 --- a/crates/iceberg/src/io/storage/config/s3.rs +++ b/crates/iceberg/src/io/storage/config/s3.rs @@ -69,8 +69,14 @@ pub const S3_DISABLE_CONFIG_LOAD: &str = "s3.disable-config-load"; /// /// This struct contains all the configuration options for connecting to Amazon S3. /// Use the builder pattern via `S3Config::builder()` to construct instances. -/// ``` -#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize, TypedBuilder)] +/// +/// Defaults follow the Iceberg `S3FileIOProperties` spec (see +/// [`PATH_STYLE_ACCESS_DEFAULT = false`](https://github.com/apache/iceberg/blob/main/aws/src/main/java/org/apache/iceberg/aws/s3/S3FileIOProperties.java)), +/// i.e. virtual-host-style addressing is enabled unless +/// `s3.path-style-access=true` is explicitly set. This matches what +/// Java clients do out of the box and is required for a number of +/// S3-compatible stores that do not support path-style URLs. +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, TypedBuilder)] pub struct S3Config { /// S3 endpoint URL. #[builder(default, setter(strip_option, into))] pub endpoint: Option<String>, @@ -88,7 +94,9 @@ pub struct S3Config { #[builder(default, setter(strip_option, into))] pub region: Option<String>, /// Enable virtual host style (opposite of path style access). - #[builder(default)] + /// + /// Defaults to `true` to match Iceberg `S3FileIOProperties.PATH_STYLE_ACCESS_DEFAULT = false`. 
+ #[builder(default = true)] pub enable_virtual_host_style: bool, /// Server side encryption type. #[builder(default, setter(strip_option, into))] @@ -125,6 +133,12 @@ pub struct S3Config { pub disable_config_load: bool, } +impl Default for S3Config { + fn default() -> Self { + Self::builder().build() + } +} + impl TryFrom<&StorageConfig> for S3Config { type Error = crate::Error; @@ -267,6 +281,17 @@ assert_eq!(s3_config.region.as_deref(), Some("eu-west-1")); } + #[test] + fn test_s3_config_default_is_virtual_host_style() { + // Matches Iceberg S3FileIOProperties.PATH_STYLE_ACCESS_DEFAULT = false. + assert!(S3Config::default().enable_virtual_host_style); + assert!( + S3Config::try_from(&StorageConfig::new()) + .unwrap() + .enable_virtual_host_style + ); + } + #[test] fn test_s3_config_path_style_access() { let storage_config = StorageConfig::new().with_prop(S3_PATH_STYLE_ACCESS, "true");
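Since `Default` now routes through the builder, callers get the new behavior from both entry points. A minimal sketch of what that looks like (the `iceberg::io::S3Config` import path is assumed here for illustration; the field and builder names are the ones in the diff):

```rust
// Sketch only: exercises the `#[builder(default = true)]` change above.
use iceberg::io::S3Config; // assumed re-export path, for illustration

fn main() {
    // `S3Config::default()` now builds via the builder, so virtual-host-style
    // addressing is on by default, matching Java's
    // S3FileIOProperties.PATH_STYLE_ACCESS_DEFAULT = false.
    assert!(S3Config::default().enable_virtual_host_style);

    // Opting back into path-style access (e.g., for MinIO):
    let cfg = S3Config::builder().enable_virtual_host_style(false).build();
    assert!(!cfg.enable_virtual_host_style);
}
```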
Only {}, {}, and {} are supported.", + other.name(), + CompressionCodec::None.name(), + CompressionCodec::Lz4.name(), + CompressionCodec::zstd_default().name() ), - )); + )), } - Ok(()) } mod metadata; @@ -70,12 +62,13 @@ mod tests { #[test] fn test_puffin_codec_validation() { - // All codecs in SUPPORTED_PUFFIN_CODECS should be valid - for codec in SUPPORTED_PUFFIN_CODECS { - assert!(validate_puffin_compression(*codec).is_ok()); - } + // Supported codecs + assert!(validate_puffin_compression(CompressionCodec::None).is_ok()); + assert!(validate_puffin_compression(CompressionCodec::Lz4).is_ok()); + assert!(validate_puffin_compression(CompressionCodec::zstd_default()).is_ok()); + assert!(validate_puffin_compression(CompressionCodec::Zstd(5)).is_ok()); - // Gzip should not be supported for Puffin files - assert!(validate_puffin_compression(CompressionCodec::Gzip).is_err()); + // Unsupported codecs + assert!(validate_puffin_compression(CompressionCodec::gzip_default()).is_err()); } } diff --git a/crates/iceberg/src/puffin/reader.rs b/crates/iceberg/src/puffin/reader.rs index d272f02d41..0aced4186f 100644 --- a/crates/iceberg/src/puffin/reader.rs +++ b/crates/iceberg/src/puffin/reader.rs @@ -144,7 +144,7 @@ mod tests { sequence_number: 1, offset: 4, length: 10, - compression_codec: CompressionCodec::Gzip, + compression_codec: CompressionCodec::gzip_default(), properties: HashMap::new(), }; @@ -153,7 +153,7 @@ mod tests { assert!(result.is_err()); let err = result.unwrap_err(); assert_eq!(err.kind(), ErrorKind::DataInvalid); - assert!(err.to_string().contains("Gzip")); + assert!(err.to_string().contains("gzip")); assert!( err.to_string() .contains("is not supported for Puffin files") diff --git a/crates/iceberg/src/puffin/test_utils.rs b/crates/iceberg/src/puffin/test_utils.rs index 39fecc6f80..e0844e2002 100644 --- a/crates/iceberg/src/puffin/test_utils.rs +++ b/crates/iceberg/src/puffin/test_utils.rs @@ -77,7 +77,7 @@ pub(crate) fn zstd_compressed_metric_blob_0_metadata() -> BlobMetadata { sequence_number: METRIC_BLOB_0_SEQUENCE_NUMBER, offset: 4, length: 22, - compression_codec: CompressionCodec::Zstd, + compression_codec: CompressionCodec::zstd_default(), properties: HashMap::new(), } } @@ -134,7 +134,7 @@ pub(crate) fn zstd_compressed_metric_blob_1_metadata() -> BlobMetadata { sequence_number: METRIC_BLOB_1_SEQUENCE_NUMBER, offset: 26, length: 77, - compression_codec: CompressionCodec::Zstd, + compression_codec: CompressionCodec::zstd_default(), properties: HashMap::new(), } } diff --git a/crates/iceberg/src/puffin/writer.rs b/crates/iceberg/src/puffin/writer.rs index 30b97f09dd..4af4970b04 100644 --- a/crates/iceberg/src/puffin/writer.rs +++ b/crates/iceberg/src/puffin/writer.rs @@ -251,7 +251,8 @@ mod tests { async fn test_write_zstd_compressed_metric_data() { let temp_dir = TempDir::new().unwrap(); let blobs = vec![blob_0(), blob_1()]; - let blobs_with_compression = blobs_with_compression(blobs.clone(), CompressionCodec::Zstd); + let blobs_with_compression = + blobs_with_compression(blobs.clone(), CompressionCodec::zstd_default()); let input_file = write_puffin_file(&temp_dir, blobs_with_compression, file_properties()) .await @@ -323,7 +324,8 @@ mod tests { async fn test_zstd_compressed_metric_data_is_bit_identical_to_java_generated_file() { let temp_dir = TempDir::new().unwrap(); let blobs = vec![blob_0(), blob_1()]; - let blobs_with_compression = blobs_with_compression(blobs, CompressionCodec::Zstd); + let blobs_with_compression = + blobs_with_compression(blobs, 
CompressionCodec::zstd_default()); assert_files_are_bit_identical( write_puffin_file(&temp_dir, blobs_with_compression, file_properties()) .await @@ -338,14 +340,15 @@ async fn test_gzip_compression_rejected() { let temp_dir = TempDir::new().unwrap(); let blobs = vec![blob_0()]; - let blobs_with_compression = blobs_with_compression(blobs, CompressionCodec::Gzip); + let blobs_with_compression = + blobs_with_compression(blobs, CompressionCodec::gzip_default()); let result = write_puffin_file(&temp_dir, blobs_with_compression, file_properties()).await; assert!(result.is_err()); let err = result.unwrap_err(); assert_eq!(err.kind(), ErrorKind::DataInvalid); - assert!(err.to_string().contains("Gzip")); + assert!(err.to_string().contains("gzip")); assert!( err.to_string() .contains("is not supported for Puffin files") diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index e52b3bdeae..4a1e27bdc1 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -40,7 +40,7 @@ use crate::metadata_columns::{get_metadata_field_id, is_metadata_column_name}; use crate::runtime::spawn; use crate::spec::{DataContentType, SnapshotRef}; use crate::table::Table; -use crate::utils::available_parallelism; +use crate::util::available_parallelism; use crate::{Error, ErrorKind, Result}; /// A stream of arrow [`RecordBatch`]es. @@ -683,6 +683,39 @@ } } + /// Creates a fixture with 5 snapshots chained as: + /// S1 (root) -> S2 -> S3 -> S4 -> S5 (current) + /// Useful for testing snapshot history traversal. + pub fn new_with_deep_history() -> Self { + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().join("table1"); + let table_metadata1_location = table_location.join("metadata/v1.json"); + + let file_io = FileIO::new_with_fs(); + + let table_metadata = { + let json_str = fs::read_to_string(format!( + "{}/testdata/example_table_metadata_v2_deep_history.json", + env!("CARGO_MANIFEST_DIR") + )) + .unwrap(); + serde_json::from_str::<TableMetadata>(&json_str).unwrap() + }; + + let table = Table::builder() + .metadata(table_metadata) + .identifier(TableIdent::from_strs(["db", "table1"]).unwrap()) + .file_io(file_io.clone()) + .metadata_location(table_metadata1_location.as_os_str().to_str().unwrap()) + .build() + .unwrap(); + + Self { + table_location: table_location.to_str().unwrap().to_string(), + table, + } + } + pub fn new_unpartitioned() -> Self { let tmp_dir = TempDir::new().unwrap(); let table_location = tmp_dir.path().join("table1"); diff --git a/crates/iceberg/src/spec/manifest/writer.rs b/crates/iceberg/src/spec/manifest/writer.rs index cc5ef737fb..1b3b605fd8 100644 --- a/crates/iceberg/src/spec/manifest/writer.rs +++ b/crates/iceberg/src/spec/manifest/writer.rs @@ -32,10 +32,14 @@ use crate::spec::manifest::_serde::{ManifestEntryV1, ManifestEntryV2}; use crate::spec::manifest::{manifest_schema_v1, manifest_schema_v2}; use crate::spec::{ DataContentType, DataFile, FieldSummary, ManifestEntry, ManifestFile, ManifestMetadata, - ManifestStatus, PrimitiveLiteral, SchemaRef, StructType, UNASSIGNED_SNAPSHOT_ID, + ManifestStatus, PrimitiveLiteral, SchemaRef, StructType, }; use crate::{Error, ErrorKind}; +/// Placeholder for snapshot ID. The field with this value must be replaced +/// with the actual snapshot ID before it is committed. +const UNASSIGNED_SNAPSHOT_ID: i64 = -1; + /// The builder used to create a [`ManifestWriter`]. 
pub struct ManifestWriterBuilder { output: OutputFile, diff --git a/crates/iceberg/src/spec/snapshot.rs b/crates/iceberg/src/spec/snapshot.rs index f60579e014..72b5417c47 100644 --- a/crates/iceberg/src/spec/snapshot.rs +++ b/crates/iceberg/src/spec/snapshot.rs @@ -33,8 +33,6 @@ use crate::{Error, ErrorKind}; /// The ref name of the main branch of the table. pub const MAIN_BRANCH: &str = "main"; -/// Placeholder for snapshot ID. The field with this value must be replaced with the actual snapshot ID before it is committed. -pub const UNASSIGNED_SNAPSHOT_ID: i64 = -1; /// Reference to [`Snapshot`]. pub type SnapshotRef = Arc<Snapshot>; diff --git a/crates/iceberg/src/spec/table_metadata.rs b/crates/iceberg/src/spec/table_metadata.rs index b91599b74f..607fd98350 100644 --- a/crates/iceberg/src/spec/table_metadata.rs +++ b/crates/iceberg/src/spec/table_metadata.rs @@ -47,6 +47,9 @@ use crate::{Error, ErrorKind}; static MAIN_BRANCH: &str = "main"; pub(crate) static ONE_MINUTE_MS: i64 = 60_000; +/// Sentinel value used by the Java implementation and older metadata files +/// to represent a missing/empty current snapshot ID. During deserialization, +/// this value is normalized to `None`. pub(crate) static EMPTY_SNAPSHOT_ID: i64 = -1; pub(crate) static INITIAL_SEQUENCE_NUMBER: i64 = 0; @@ -457,7 +460,7 @@ && metadata_content[0] == 0x1F && metadata_content[1] == 0x8B { - let decompressed_data = CompressionCodec::Gzip + let decompressed_data = CompressionCodec::gzip_default() .decompress(metadata_content.to_vec()) .map_err(|e| { Error::new( @@ -499,7 +502,7 @@ // Apply compression based on codec let data_to_write = match codec { - CompressionCodec::Gzip => codec.compress(json_data)?, + CompressionCodec::Gzip(_) => codec.compress(json_data)?, CompressionCodec::None => json_data, _ => { return Err(Error::new( @@ -765,8 +768,8 @@ pub(super) mod _serde { use uuid::Uuid; use super::{ - DEFAULT_PARTITION_SPEC_ID, FormatVersion, MAIN_BRANCH, MetadataLog, SnapshotLog, - TableMetadata, + DEFAULT_PARTITION_SPEC_ID, EMPTY_SNAPSHOT_ID, FormatVersion, MAIN_BRANCH, MetadataLog, + SnapshotLog, TableMetadata, }; use crate::spec::schema::_serde::{SchemaV1, SchemaV2}; use crate::spec::snapshot::_serde::{SnapshotV1, SnapshotV2, SnapshotV3}; @@ -950,7 +953,7 @@ encryption_keys, snapshots, } = value; - let current_snapshot_id = if let &Some(-1) = &value.current_snapshot_id { + let current_snapshot_id = if value.current_snapshot_id == Some(EMPTY_SNAPSHOT_ID) { None } else { value.current_snapshot_id @@ -1063,7 +1066,7 @@ fn try_from(value: TableMetadataV2) -> Result<Self, Self::Error> { let snapshots = value.snapshots; let value = value.shared; - let current_snapshot_id = if let &Some(-1) = &value.current_snapshot_id { + let current_snapshot_id = if value.current_snapshot_id == Some(EMPTY_SNAPSHOT_ID) { None } else { value.current_snapshot_id @@ -1170,7 +1173,7 @@ impl TryFrom<TableMetadataV1> for TableMetadata { type Error = Error; fn try_from(value: TableMetadataV1) -> Result<Self, Self::Error> { - let current_snapshot_id = if let &Some(-1) = &value.current_snapshot_id { + let current_snapshot_id = if value.current_snapshot_id == Some(EMPTY_SNAPSHOT_ID) { None } else { value.current_snapshot_id @@ -3300,6 +3303,18 @@ check_table_metadata_serde(&metadata, expected); } + #[test] + fn test_empty_snapshot_id_is_normalized_to_none() { + let metadata = + fs::read_to_string("testdata/table_metadata/TableMetadataV1Valid.json").unwrap(); + let deserialized: 
TableMetadata = serde_json::from_str(&metadata).unwrap(); + assert_eq!( + deserialized.current_snapshot_id(), + None, + "current_snapshot_id of -1 should be deserialized as None" + ); + } + #[test] fn test_table_metadata_v1_compat() { let metadata = @@ -3618,7 +3633,7 @@ let original_metadata: TableMetadata = get_test_table_metadata("TableMetadataV2Valid.json"); let json = serde_json::to_string(&original_metadata).unwrap(); - let compressed = CompressionCodec::Gzip + let compressed = CompressionCodec::gzip_default() .compress(json.into_bytes()) .expect("failed to compress metadata"); std::fs::write(&metadata_location, &compressed).expect("failed to write metadata"); diff --git a/crates/iceberg/src/spec/table_metadata_builder.rs b/crates/iceberg/src/spec/table_metadata_builder.rs index 62311a15a2..65dbae1bfc 100644 --- a/crates/iceberg/src/spec/table_metadata_builder.rs +++ b/crates/iceberg/src/spec/table_metadata_builder.rs @@ -570,7 +570,7 @@ impl TableMetadataBuilder { /// Remove a reference /// - /// If `ref_name='main'` the current snapshot id is set to -1. + /// If `ref_name='main'` the current snapshot id is set to `None`. pub fn remove_ref(mut self, ref_name: &str) -> Self { if ref_name == MAIN_BRANCH { self.metadata.current_snapshot_id = None;
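The recurring `-1` checks replaced above all normalize the same legacy sentinel. As a standalone sketch of that rule (the constant value comes from the diff; everything else here is illustrative):

```rust
// EMPTY_SNAPSHOT_ID mirrors the Java implementation's sentinel value.
const EMPTY_SNAPSHOT_ID: i64 = -1;

/// What the V1/V2 serde impls above do with `current-snapshot-id`.
fn normalize_current_snapshot_id(raw: Option<i64>) -> Option<i64> {
    if raw == Some(EMPTY_SNAPSHOT_ID) { None } else { raw }
}

fn main() {
    assert_eq!(normalize_current_snapshot_id(Some(-1)), None); // legacy metadata files
    assert_eq!(normalize_current_snapshot_id(Some(42)), Some(42));
    assert_eq!(normalize_current_snapshot_id(None), None);
}
```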
Only '{}' and '{}' are supported for metadata files.", + CompressionCodec::None.name(), + CompressionCodec::gzip_default().name() ), )), } @@ -324,7 +328,7 @@ mod tests { let table_properties = TableProperties::try_from(&props).unwrap(); assert_eq!( table_properties.metadata_compression_codec, - CompressionCodec::Gzip + CompressionCodec::gzip_default() ); } @@ -351,7 +355,7 @@ mod tests { let table_properties = TableProperties::try_from(&props_upper).unwrap(); assert_eq!( table_properties.metadata_compression_codec, - CompressionCodec::Gzip + CompressionCodec::gzip_default() ); // Test mixed case @@ -362,7 +366,7 @@ mod tests { let table_properties = TableProperties::try_from(&props_mixed).unwrap(); assert_eq!( table_properties.metadata_compression_codec, - CompressionCodec::Gzip + CompressionCodec::gzip_default() ); // Test "NONE" should also be case-insensitive @@ -517,7 +521,7 @@ mod tests { )]); assert_eq!( parse_metadata_file_compression(&props).unwrap(), - CompressionCodec::Gzip + CompressionCodec::gzip_default() ); // Test case insensitivity - "NONE" @@ -537,7 +541,7 @@ mod tests { )]); assert_eq!( parse_metadata_file_compression(&props).unwrap(), - CompressionCodec::Gzip + CompressionCodec::gzip_default() ); // Test case insensitivity - "GzIp" @@ -547,7 +551,7 @@ mod tests { )]); assert_eq!( parse_metadata_file_compression(&props).unwrap(), - CompressionCodec::Gzip + CompressionCodec::gzip_default() ); // Test default when property is missing diff --git a/crates/iceberg/src/utils.rs b/crates/iceberg/src/util/mod.rs similarity index 96% rename from crates/iceberg/src/utils.rs rename to crates/iceberg/src/util/mod.rs index 00d3e69bd3..28eda66d49 100644 --- a/crates/iceberg/src/utils.rs +++ b/crates/iceberg/src/util/mod.rs @@ -17,6 +17,9 @@ use std::num::NonZeroUsize; +/// Utilities for working with snapshots. +pub mod snapshot; + // Use a default value of 1 as the safest option. // See https://doc.rust-lang.org/std/thread/fn.available_parallelism.html#limitations // for more details. diff --git a/crates/iceberg/src/util/snapshot.rs b/crates/iceberg/src/util/snapshot.rs new file mode 100644 index 0000000000..98997ae815 --- /dev/null +++ b/crates/iceberg/src/util/snapshot.rs @@ -0,0 +1,185 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::spec::{SnapshotRef, TableMetadataRef}; + +struct Ancestors { + next: Option, + get_snapshot: Box Option + Send>, +} + +impl Iterator for Ancestors { + type Item = SnapshotRef; + + fn next(&mut self) -> Option { + let snapshot = self.next.take()?; + self.next = snapshot + .parent_snapshot_id() + .and_then(|id| (self.get_snapshot)(id)); + Some(snapshot) + } +} + +/// Iterate starting from `snapshot_id` (inclusive) to the root snapshot. 
+pub fn ancestors_of( + table_metadata: &TableMetadataRef, + snapshot_id: i64, +) -> impl Iterator<Item = SnapshotRef> + Send { + let initial = table_metadata.snapshot_by_id(snapshot_id).cloned(); + let table_metadata = table_metadata.clone(); + Ancestors { + next: initial, + get_snapshot: Box::new(move |id| table_metadata.snapshot_by_id(id).cloned()), + } +} + +/// Iterate starting from `latest_snapshot_id` (inclusive) to `oldest_snapshot_id` (exclusive). +pub fn ancestors_between( + table_metadata: &TableMetadataRef, + latest_snapshot_id: i64, + oldest_snapshot_id: Option<i64>, +) -> impl Iterator<Item = SnapshotRef> + Send { + ancestors_of(table_metadata, latest_snapshot_id).take_while(move |snapshot| { + oldest_snapshot_id + .map(|id| snapshot.snapshot_id() != id) + .unwrap_or(true) + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::scan::tests::TableTestFixture; + + // Five snapshots chained as: S1 (root) -> S2 -> S3 -> S4 -> S5 (current) + const S1: i64 = 3051729675574597004; + const S2: i64 = 3055729675574597004; + const S3: i64 = 3056729675574597004; + const S4: i64 = 3057729675574597004; + const S5: i64 = 3059729675574597004; + + fn metadata() -> TableMetadataRef { + let fixture = TableTestFixture::new_with_deep_history(); + std::sync::Arc::new(fixture.table.metadata().clone()) + } + + // --- ancestors_of --- + + #[test] + fn test_ancestors_of_nonexistent_snapshot_returns_empty() { + let meta = metadata(); + let ids: Vec<i64> = ancestors_of(&meta, 999).map(|s| s.snapshot_id()).collect(); + assert!(ids.is_empty()); + } + + #[test] + fn test_ancestors_of_root_returns_only_root() { + let meta = metadata(); + let ids: Vec<i64> = ancestors_of(&meta, S1).map(|s| s.snapshot_id()).collect(); + assert_eq!(ids, vec![S1]); + } + + #[test] + fn test_ancestors_of_leaf_returns_full_chain() { + let meta = metadata(); + let ids: Vec<i64> = ancestors_of(&meta, S5).map(|s| s.snapshot_id()).collect(); + assert_eq!(ids, vec![S5, S4, S3, S2, S1]); + } + + #[test] + fn test_ancestors_of_mid_chain_returns_partial_chain() { + let meta = metadata(); + let ids: Vec<i64> = ancestors_of(&meta, S3).map(|s| s.snapshot_id()).collect(); + assert_eq!(ids, vec![S3, S2, S1]); + } + + #[test] + fn test_ancestors_of_second_snapshot() { + let meta = metadata(); + let ids: Vec<i64> = ancestors_of(&meta, S2).map(|s| s.snapshot_id()).collect(); + assert_eq!(ids, vec![S2, S1]); + } + + // --- ancestors_between --- + + #[test] + fn test_ancestors_between_same_id_returns_empty() { + let meta = metadata(); + let ids: Vec<i64> = ancestors_between(&meta, S3, Some(S3)) + .map(|s| s.snapshot_id()) + .collect(); + assert!(ids.is_empty()); + } + + #[test] + fn test_ancestors_between_no_oldest_returns_all_ancestors() { + let meta = metadata(); + let ids: Vec<i64> = ancestors_between(&meta, S5, None) + .map(|s| s.snapshot_id()) + .collect(); + assert_eq!(ids, vec![S5, S4, S3, S2, S1]); + } + + #[test] + fn test_ancestors_between_excludes_oldest_snapshot() { + let meta = metadata(); + // S5 down to (but not including) S2 + let ids: Vec<i64> = ancestors_between(&meta, S5, Some(S2)) + .map(|s| s.snapshot_id()) + .collect(); + assert_eq!(ids, vec![S5, S4, S3]); + } + + #[test] + fn test_ancestors_between_adjacent_snapshots() { + let meta = metadata(); + // S3 down to (but not including) S2 — only S3 itself + let ids: Vec<i64> = ancestors_between(&meta, S3, Some(S2)) + .map(|s| s.snapshot_id()) + .collect(); + assert_eq!(ids, vec![S3]); + } + + #[test] + fn test_ancestors_between_leaf_and_root() { + let meta = metadata(); + // S5 down to (but not including) S1 + let ids: Vec<i64> = ancestors_between(&meta, S5, 
Some(S1)) + .map(|s| s.snapshot_id()) + .collect(); + assert_eq!(ids, vec![S5, S4, S3, S2]); + } + + #[test] + fn test_ancestors_between_nonexistent_oldest_returns_full_chain() { + let meta = metadata(); + // oldest_snapshot_id doesn't exist in the chain, so take_while never stops + let ids: Vec<i64> = ancestors_between(&meta, S5, Some(999)) + .map(|s| s.snapshot_id()) + .collect(); + assert_eq!(ids, vec![S5, S4, S3, S2, S1]); + } + + #[test] + fn test_ancestors_between_nonexistent_latest_returns_empty() { + let meta = metadata(); + let ids: Vec<i64> = ancestors_between(&meta, 999, Some(S1)) + .map(|s| s.snapshot_id()) + .collect(); + assert!(ids.is_empty()); + } +}
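For callers, the two functions compose naturally with iterator adapters. A hypothetical use, collecting the IDs an incremental read would cover, newest first (the helper name and scenario are illustrative; the function signatures and import paths are the ones added above):

```rust
use iceberg::spec::TableMetadataRef;
use iceberg::util::snapshot::ancestors_between;

/// Snapshot IDs between the current snapshot and the last one already
/// consumed (exclusive), newest first.
fn snapshots_to_replay(
    meta: &TableMetadataRef,
    current: i64,
    last_consumed: Option<i64>,
) -> Vec<i64> {
    ancestors_between(meta, current, last_consumed)
        .map(|s| s.snapshot_id())
        .collect()
}
```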
"parent-snapshot-id": 3056729675574597004, + "timestamp-ms": 1595100955770, + "sequence-number": 3, + "summary": {"operation": "overwrite"}, + "manifest-list": "s3://bucket/metadata/snap-3057729675574597004.avro", + "schema-id": 1 + }, + { + "snapshot-id": 3059729675574597004, + "parent-snapshot-id": 3057729675574597004, + "timestamp-ms": 1602638573590, + "sequence-number": 4, + "summary": {"operation": "append"}, + "manifest-list": "s3://bucket/metadata/snap-3059729675574597004.avro", + "schema-id": 1 + } + ], + "snapshot-log": [ + {"snapshot-id": 3051729675574597004, "timestamp-ms": 1515100955770}, + {"snapshot-id": 3055729675574597004, "timestamp-ms": 1555100955770}, + {"snapshot-id": 3056729675574597004, "timestamp-ms": 1575100955770}, + {"snapshot-id": 3057729675574597004, "timestamp-ms": 1595100955770}, + {"snapshot-id": 3059729675574597004, "timestamp-ms": 1602638573590} + ], + "metadata-log": [], + "refs": {"main": {"snapshot-id": 3059729675574597004, "type": "branch"}} +} diff --git a/crates/integration_tests/src/lib.rs b/crates/integration_tests/src/lib.rs index 4bf8f4d19c..feafa3ae9f 100644 --- a/crates/integration_tests/src/lib.rs +++ b/crates/integration_tests/src/lib.rs @@ -18,7 +18,9 @@ use std::collections::HashMap; use std::sync::OnceLock; -use iceberg::io::{S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_REGION, S3_SECRET_ACCESS_KEY}; +use iceberg::io::{ + S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_PATH_STYLE_ACCESS, S3_REGION, S3_SECRET_ACCESS_KEY, +}; use iceberg_catalog_rest::REST_CATALOG_PROP_URI; use iceberg_test_utils::{get_minio_endpoint, get_rest_catalog_endpoint, set_up}; @@ -45,6 +47,7 @@ impl GlobalTestFixture { (S3_ACCESS_KEY_ID.to_string(), "admin".to_string()), (S3_SECRET_ACCESS_KEY.to_string(), "password".to_string()), (S3_REGION.to_string(), "us-east-1".to_string()), + (S3_PATH_STYLE_ACCESS.to_string(), "true".to_string()), ]); GlobalTestFixture { catalog_config } diff --git a/crates/integration_tests/tests/common/mod.rs b/crates/integration_tests/tests/common/mod.rs index e49a57465c..b7197a3a46 100644 --- a/crates/integration_tests/tests/common/mod.rs +++ b/crates/integration_tests/tests/common/mod.rs @@ -28,7 +28,6 @@ pub async fn random_ns() -> Namespace { let fixture = get_test_fixture(); let rest_catalog = RestCatalogBuilder::default() .with_storage_factory(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3".to_string(), customized_credential_load: None, })) .load("rest", fixture.catalog_config.clone()) diff --git a/crates/integration_tests/tests/conflict_commit_test.rs b/crates/integration_tests/tests/conflict_commit_test.rs index 3b1362b95d..af2c7a7779 100644 --- a/crates/integration_tests/tests/conflict_commit_test.rs +++ b/crates/integration_tests/tests/conflict_commit_test.rs @@ -43,7 +43,6 @@ async fn test_append_data_file_conflict() { let fixture = get_test_fixture(); let rest_catalog = RestCatalogBuilder::default() .with_storage_factory(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3".to_string(), customized_credential_load: None, })) .load("rest", fixture.catalog_config.clone()) diff --git a/crates/integration_tests/tests/read_evolved_schema.rs b/crates/integration_tests/tests/read_evolved_schema.rs index ae25a08987..f7416be2d4 100644 --- a/crates/integration_tests/tests/read_evolved_schema.rs +++ b/crates/integration_tests/tests/read_evolved_schema.rs @@ -34,7 +34,6 @@ async fn test_evolved_schema() { let fixture = get_test_fixture(); let rest_catalog = RestCatalogBuilder::default() .with_storage_factory(Arc::new(OpenDalStorageFactory::S3 
{ - configured_scheme: "s3".to_string(), customized_credential_load: None, })) .load("rest", fixture.catalog_config.clone()) diff --git a/crates/integration_tests/tests/read_positional_deletes.rs b/crates/integration_tests/tests/read_positional_deletes.rs index d4c4afeaf3..0f79596a12 100644 --- a/crates/integration_tests/tests/read_positional_deletes.rs +++ b/crates/integration_tests/tests/read_positional_deletes.rs @@ -30,7 +30,6 @@ async fn test_read_table_with_positional_deletes() { let fixture = get_test_fixture(); let rest_catalog = RestCatalogBuilder::default() .with_storage_factory(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3".to_string(), customized_credential_load: None, })) .load("rest", fixture.catalog_config.clone()) diff --git a/crates/integrations/datafusion/src/physical_plan/mod.rs b/crates/integrations/datafusion/src/physical_plan/mod.rs index 5a9845cde0..aeac30de32 100644 --- a/crates/integrations/datafusion/src/physical_plan/mod.rs +++ b/crates/integrations/datafusion/src/physical_plan/mod.rs @@ -26,5 +26,6 @@ pub(crate) mod write; pub(crate) const DATA_FILES_COL_NAME: &str = "data_files"; +pub use expr_to_predicate::convert_filters_to_predicate; pub use project::project_with_partition; pub use scan::IcebergTableScan; diff --git a/crates/storage/opendal/README.md b/crates/storage/opendal/README.md index c5092eb97a..a4ad512e17 100644 --- a/crates/storage/opendal/README.md +++ b/crates/storage/opendal/README.md @@ -61,7 +61,6 @@ use iceberg_storage_opendal::OpenDalStorageFactory; async fn main() -> iceberg::Result<()> { let catalog = RestCatalogBuilder::default() .with_storage_factory(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3".to_string(), customized_credential_load: None, })) .load( diff --git a/crates/storage/opendal/src/azdls.rs b/crates/storage/opendal/src/azdls.rs index 6251f8cdaa..b47c55d9e7 100644 --- a/crates/storage/opendal/src/azdls.rs +++ b/crates/storage/opendal/src/azdls.rs @@ -91,10 +91,9 @@ pub(crate) fn azdls_config_parse(mut properties: HashMap) -> Res pub(crate) fn azdls_create_operator<'a>( absolute_path: &'a str, config: &AzdlsConfig, - configured_scheme: &AzureStorageScheme, ) -> Result<(opendal::Operator, &'a str)> { let path = absolute_path.parse::()?; - match_path_with_config(&path, config, configured_scheme)?; + match_path_with_config(&path, config)?; let op = azdls_config_build(config, &path)?; @@ -160,18 +159,7 @@ impl FromStr for AzureStorageScheme { } /// Validates whether the given path matches what's configured for the backend. 
-pub(crate) fn match_path_with_config( - path: &AzureStoragePath, - config: &AzdlsConfig, - configured_scheme: &AzureStorageScheme, -) -> Result<()> { - ensure_data_valid!( - &path.scheme == configured_scheme, - "Storage::Azdls: Scheme mismatch: configured {}, passed {}", - configured_scheme, - path.scheme - ); - +pub(crate) fn match_path_with_config(path: &AzureStoragePath, config: &AzdlsConfig) -> Result<()> { if let Some(ref configured_account_name) = config.account_name { ensure_data_valid!( &path.account_name == configured_account_name, @@ -408,7 +396,6 @@ mod tests { endpoint: Some("https://myaccount.dfs.core.windows.net".to_string()), ..Default::default() }, - AzureStorageScheme::Abfss, ), Some(("myfs", "/path/to/file.parquet")), ), @@ -421,33 +408,19 @@ mod tests { endpoint: Some("https://myaccount.dfs.core.windows.net".to_string()), ..Default::default() }, - AzureStorageScheme::Abfss, - ), - None, - ), - ( - "different scheme", - ( - "wasbs://myfs@myaccount.dfs.core.windows.net/path/to/file.parquet", - AzdlsConfig { - account_name: Some("myaccount".to_string()), - endpoint: Some("https://myaccount.dfs.core.windows.net".to_string()), - ..Default::default() - }, - AzureStorageScheme::Abfss, ), None, ), ( "incompatible scheme for endpoint", ( - "abfs://myfs@myaccount.dfs.core.windows.net/path/to/file.parquet", + // `abfss` implies https; configured endpoint is plain http. + "abfss://myfs@myaccount.dfs.core.windows.net/path/to/file.parquet", AzdlsConfig { account_name: Some("myaccount".to_string()), endpoint: Some("http://myaccount.dfs.core.windows.net".to_string()), ..Default::default() }, - AzureStorageScheme::Abfss, ), None, ), @@ -460,7 +433,6 @@ mod tests { endpoint: Some("https://myaccount.dfs.core.chinacloudapi.cn".to_string()), ..Default::default() }, - AzureStorageScheme::Abfss, ), None, ), @@ -474,14 +446,27 @@ mod tests { endpoint: None, ..Default::default() }, - AzureStorageScheme::Abfs, + ), + Some(("myfs", "/path/to/file.parquet")), + ), + ( + "scheme differs from a previously-configured one is accepted", + ( + // No configured scheme exists anymore; both abfss and wasbs + // should be accepted by the same storage. + "wasbs://myfs@myaccount.blob.core.windows.net/path/to/file.parquet", + AzdlsConfig { + account_name: Some("myaccount".to_string()), + endpoint: Some("https://myaccount.blob.core.windows.net".to_string()), + ..Default::default() + }, ), Some(("myfs", "/path/to/file.parquet")), ), ]; for (name, input, expected) in test_cases { - let result = azdls_create_operator(input.0, &input.1, &input.2); + let result = azdls_create_operator(input.0, &input.1); match expected { Some((expected_filesystem, expected_path)) => { assert!(result.is_ok(), "Test case {name} failed: {result:?}"); diff --git a/crates/storage/opendal/src/lib.rs b/crates/storage/opendal/src/lib.rs index 8160680523..a0336868e3 100644 --- a/crates/storage/opendal/src/lib.rs +++ b/crates/storage/opendal/src/lib.rs @@ -46,7 +46,6 @@ use utils::from_opendal_error; cfg_if! { if #[cfg(feature = "opendal-azdls")] { mod azdls; - use azdls::AzureStorageScheme; use azdls::*; use opendal::services::AzdlsConfig; } @@ -108,9 +107,6 @@ pub enum OpenDalStorageFactory { /// S3 storage factory. #[cfg(feature = "opendal-s3")] S3 { - /// s3 storage could have `s3://` and `s3a://`. - /// Storing the scheme string here to return the correct path. - configured_scheme: String, /// Custom AWS credential loader. 
#[serde(skip)] customized_credential_load: Option<CustomAwsCredentialLoader>, @@ -123,10 +119,7 @@ Oss, /// Azure Data Lake Storage factory. #[cfg(feature = "opendal-azdls")] - Azdls { - /// The configured Azure storage scheme. - configured_scheme: AzureStorageScheme, - }, + Azdls, } #[typetag::serde(name = "OpenDalStorageFactory")] @@ -142,10 +135,8 @@ impl StorageFactory for OpenDalStorageFactory { OpenDalStorageFactory::Fs => Ok(Arc::new(OpenDalStorage::LocalFs)), #[cfg(feature = "opendal-s3")] OpenDalStorageFactory::S3 { - configured_scheme, customized_credential_load, } => Ok(Arc::new(OpenDalStorage::S3 { - configured_scheme: configured_scheme.clone(), config: s3_config_parse(config.props().clone())?.into(), customized_credential_load: customized_credential_load.clone(), })), @@ -158,12 +149,9 @@ config: oss_config_parse(config.props().clone())?.into(), })), #[cfg(feature = "opendal-azdls")] - OpenDalStorageFactory::Azdls { configured_scheme } => { - Ok(Arc::new(OpenDalStorage::Azdls { - configured_scheme: configured_scheme.clone(), - config: azdls_config_parse(config.props().clone())?.into(), - })) - } + OpenDalStorageFactory::Azdls => Ok(Arc::new(OpenDalStorage::Azdls { + config: azdls_config_parse(config.props().clone())?.into(), + })), #[cfg(all( not(feature = "opendal-memory"), not(feature = "opendal-fs"), @@ -196,11 +184,11 @@ pub enum OpenDalStorage { #[cfg(feature = "opendal-fs")] LocalFs, /// S3 storage variant. + /// + /// Accepts any S3-family URL (`s3://`, `s3a://`, `s3n://`); the scheme is + /// derived from the path at call time. #[cfg(feature = "opendal-s3")] S3 { - /// s3 storage could have `s3://` and `s3a://`. - /// Storing the scheme string here to return the correct path. - configured_scheme: String, /// S3 configuration. config: Arc<S3Config>, /// Custom AWS credential loader. @@ -220,16 +208,13 @@ config: Arc<OssConfig>, }, /// Azure Data Lake Storage variant. - /// Expects paths of the form + /// + /// Accepts paths of the form /// `abfs[s]://<filesystem>@<account>.dfs.<endpoint>/<path>` or /// `wasb[s]://<container>@<account>.blob.<endpoint>/<path>`. + /// The scheme is derived from the path at call time. #[cfg(feature = "opendal-azdls")] - #[allow(private_interfaces)] Azdls { - /// The configured Azure storage scheme. - /// Because Azdls accepts multiple possible schemes, we store the full - /// passed scheme here to later validate schemes passed via paths. config: Arc<AzdlsConfig>, }, @@ -274,15 +259,21 @@ impl OpenDalStorage { } #[cfg(feature = "opendal-s3")] OpenDalStorage::S3 { - configured_scheme, config, customized_credential_load, } => { let op = s3_config_build(config, customized_credential_load, path)?; let op_info = op.info(); - // Check prefix of s3 path. - let prefix = format!("{}://{}/", configured_scheme, op_info.name()); + // Use the URL scheme in the path for prefix matching. This enables + // use of S3-compatible storage backends using custom schemes (e.g., `minio://`, `r2://`). 
+ let url = url::Url::parse(path).map_err(|e| { + Error::new( + ErrorKind::DataInvalid, + format!("Invalid s3 url: {path}: {e}"), + ) + })?; + let prefix = format!("{}://{}/", url.scheme(), op_info.name()); if path.starts_with(&prefix) { (op, &path[prefix.len()..]) } else { @@ -319,10 +310,7 @@ } } #[cfg(feature = "opendal-azdls")] - OpenDalStorage::Azdls { - configured_scheme, - config, - } => azdls_create_operator(path, config, configured_scheme)?, + OpenDalStorage::Azdls { config } => azdls_create_operator(path, config)?, #[cfg(all( not(feature = "opendal-s3"), not(feature = "opendal-fs"), @@ -357,9 +345,7 @@ #[cfg(feature = "opendal-fs")] OpenDalStorage::LocalFs => Ok(path.strip_prefix("file:/").unwrap_or(&path[1..])), #[cfg(feature = "opendal-s3")] - OpenDalStorage::S3 { - configured_scheme, .. - } => { + OpenDalStorage::S3 { .. } => { let url = url::Url::parse(path)?; let bucket = url.host_str().ok_or_else(|| { Error::new( @@ -367,7 +353,7 @@ format!("Invalid s3 url: {path}, missing bucket"), ) })?; - let prefix = format!("{}://{}/", configured_scheme, bucket); + let prefix = format!("{}://{}/", url.scheme(), bucket); if path.starts_with(&prefix) { Ok(&path[prefix.len()..]) } else { @@ -416,12 +402,9 @@ } } #[cfg(feature = "opendal-azdls")] - OpenDalStorage::Azdls { - configured_scheme, - config, - } => { + OpenDalStorage::Azdls { config } => { let azure_path = path.parse::<AzureStoragePath>()?; - match_path_with_config(&azure_path, config, configured_scheme)?; + match_path_with_config(&azure_path, config)?; let relative_path_len = azure_path.path.len(); Ok(&path[path.len() - relative_path_len..]) } @@ -631,47 +614,21 @@ mod tests { #[test] fn test_relativize_path_s3() { let storage = OpenDalStorage::S3 { - configured_scheme: "s3".to_string(), config: Arc::new(S3Config::default()), customized_credential_load: None, }; - assert_eq!( - storage - .relativize_path("s3://my-bucket/path/to/file.parquet") - .unwrap(), - "path/to/file.parquet" - ); - - // s3a scheme - let storage_s3a = OpenDalStorage::S3 { - configured_scheme: "s3a".to_string(), - config: Arc::new(S3Config::default()), - customized_credential_load: None, - }; - assert_eq!( - storage_s3a - .relativize_path("s3a://my-bucket/path/to/file.parquet") - .unwrap(), - "path/to/file.parquet" - ); - } - - #[cfg(feature = "opendal-s3")] - #[test] - fn test_relativize_path_s3_scheme_mismatch() { - let storage = OpenDalStorage::S3 { - configured_scheme: "s3".to_string(), - config: Arc::new(S3Config::default()), - customized_credential_load: None, - }; - - // Scheme mismatch should error - assert!( - storage - .relativize_path("s3a://my-bucket/path/to/file.parquet") - .is_err() - ); + // All S3-family schemes are accepted by the same storage instance. + // Custom schemes for S3-compatible stores (e.g., `minio://`) are also + // accepted because the path's scheme is used as-is for prefix matching. 
+ for scheme in ["s3", "s3a", "s3n", "minio"] { + assert_eq!( + storage + .relativize_path(&format!("{scheme}://my-bucket/path/to/file.parquet")) + .unwrap(), + "path/to/file.parquet" + ); + } } #[cfg(feature = "opendal-gcs")] @@ -736,7 +693,6 @@ mod tests { #[test] fn test_relativize_path_azdls() { let storage = OpenDalStorage::Azdls { - configured_scheme: AzureStorageScheme::Abfss, config: Arc::new(AzdlsConfig { account_name: Some("myaccount".to_string()), endpoint: Some("https://myaccount.dfs.core.windows.net".to_string()), @@ -751,24 +707,4 @@ mod tests { "/path/to/file.parquet" ); } - - #[cfg(feature = "opendal-azdls")] - #[test] - fn test_relativize_path_azdls_scheme_mismatch() { - let storage = OpenDalStorage::Azdls { - configured_scheme: AzureStorageScheme::Abfss, - config: Arc::new(AzdlsConfig { - account_name: Some("myaccount".to_string()), - endpoint: Some("https://myaccount.dfs.core.windows.net".to_string()), - ..Default::default() - }), - }; - - // wasbs scheme doesn't match configured abfss - assert!( - storage - .relativize_path("wasbs://myfs@myaccount.dfs.core.windows.net/path/to/file.parquet") - .is_err() - ); - } } diff --git a/crates/storage/opendal/src/resolving.rs b/crates/storage/opendal/src/resolving.rs index 7c06cf96a5..64a16b18d2 100644 --- a/crates/storage/opendal/src/resolving.rs +++ b/crates/storage/opendal/src/resolving.rs @@ -70,29 +70,28 @@ fn parse_scheme(scheme: &str) -> Result { } } -/// Extract the scheme string from a path URL. -fn extract_scheme(path: &str) -> Result { +/// Extract the [`Scheme`] family from a path URL. +fn extract_scheme(path: &str) -> Result { let url = Url::parse(path).map_err(|e| { Error::new( ErrorKind::DataInvalid, format!("Invalid path: {path}, failed to parse URL: {e}"), ) })?; - Ok(url.scheme().to_string()) + parse_scheme(url.scheme()) } /// Build an [`OpenDalStorage`] variant for the given scheme and config properties. fn build_storage_for_scheme( - scheme: &str, + scheme: Scheme, props: &HashMap, #[cfg(feature = "opendal-s3")] customized_credential_load: &Option, ) -> Result { - match parse_scheme(scheme)? { + match scheme { #[cfg(feature = "opendal-s3")] Scheme::S3 => { let config = crate::s3::s3_config_parse(props.clone())?; Ok(OpenDalStorage::S3 { - configured_scheme: scheme.to_string(), config: Arc::new(config), customized_credential_load: customized_credential_load.clone(), }) @@ -113,10 +112,8 @@ fn build_storage_for_scheme( } #[cfg(feature = "opendal-azdls")] Scheme::Azdls => { - let configured_scheme: crate::azdls::AzureStorageScheme = scheme.parse()?; let config = crate::azdls::azdls_config_parse(props.clone())?; Ok(OpenDalStorage::Azdls { - configured_scheme, config: Arc::new(config), }) } @@ -196,14 +193,15 @@ impl StorageFactory for OpenDalResolvingStorageFactory { /// to the appropriate [`OpenDalStorage`] variant. /// /// Sub-storages are lazily created on first use for each scheme and cached -/// for subsequent operations. +/// for subsequent operations. Scheme aliases like `s3`/`s3a`/`s3n` map to +/// the same [`Scheme`] variant, so they share a storage instance. #[derive(Debug, Serialize, Deserialize)] pub struct OpenDalResolvingStorage { /// Configuration properties shared across all backends. props: HashMap, - /// Cache of scheme → storage mappings. + /// Cache of scheme to storage mappings. #[serde(skip, default)] - storages: RwLock>>, + storages: RwLock>>, /// Custom AWS credential loader for S3 storage. 
#[cfg(feature = "opendal-s3")] #[serde(skip)] @@ -239,7 +237,7 @@ impl OpenDalResolvingStorage { } let storage = build_storage_for_scheme( - &scheme, + scheme, &self.props, #[cfg(feature = "opendal-s3")] &self.customized_credential_load, @@ -288,7 +286,7 @@ impl Storage for OpenDalResolvingStorage { async fn delete_stream(&self, mut paths: BoxStream<'static, String>) -> Result<()> { // Group paths by scheme so each resolved storage receives a batch, // avoiding repeated operator creation per path. - let mut grouped: HashMap> = HashMap::new(); + let mut grouped: HashMap> = HashMap::new(); while let Some(path) = paths.next().await { let scheme = extract_scheme(&path)?; grouped.entry(scheme).or_default().push(path); @@ -317,3 +315,54 @@ impl Storage for OpenDalResolvingStorage { )) } } + +#[cfg(test)] +mod tests { + use super::*; + + /// Builds a resolving storage with empty props, suitable for `resolve()` + /// calls that don't actually hit any backend. + fn empty_resolving_storage() -> OpenDalResolvingStorage { + OpenDalResolvingStorage { + props: HashMap::new(), + storages: RwLock::new(HashMap::new()), + #[cfg(feature = "opendal-s3")] + customized_credential_load: None, + } + } + + #[cfg(feature = "opendal-s3")] + #[test] + fn test_resolve_s3_aliases_share_instance() { + let storage = empty_resolving_storage(); + + // All three S3-family schemes must collapse to a single cached + // `Arc` so that catalogs handing the resolver a mix + // of `s3://`, `s3a://`, `s3n://` paths don't rebuild operators. + let a = storage.resolve("s3://bucket/key").unwrap(); + let b = storage.resolve("s3a://bucket/key").unwrap(); + let c = storage.resolve("s3n://bucket/key").unwrap(); + + assert!(Arc::ptr_eq(&a, &b), "s3 and s3a should share one instance"); + assert!(Arc::ptr_eq(&a, &c), "s3 and s3n should share one instance"); + } + + #[cfg(feature = "opendal-azdls")] + #[test] + fn test_resolve_azdls_aliases_share_instance() { + let storage = empty_resolving_storage(); + + let path_for = |scheme: &str| { + format!("{scheme}://myfs@myaccount.dfs.core.windows.net/path/to/file.parquet") + }; + + // All Azure schemes collapse onto one cached instance. + let abfss = storage.resolve(&path_for("abfss")).unwrap(); + let abfs = storage.resolve(&path_for("abfs")).unwrap(); + + assert!( + Arc::ptr_eq(&abfss, &abfs), + "abfss and abfs should share one instance" + ); + } +} diff --git a/crates/storage/opendal/src/s3.rs b/crates/storage/opendal/src/s3.rs index 7db88d273f..2e21418606 100644 --- a/crates/storage/opendal/src/s3.rs +++ b/crates/storage/opendal/src/s3.rs @@ -37,6 +37,12 @@ use crate::utils::{from_opendal_error, is_truthy}; /// Parse iceberg props to s3 config. pub(crate) fn s3_config_parse(mut m: HashMap) -> Result { let mut cfg = S3Config::default(); + // Match Iceberg `S3FileIOProperties.PATH_STYLE_ACCESS_DEFAULT = false`: + // virtual-host-style addressing is the spec default. opendal's own + // default is path-style, which disagrees with the Java SDK and breaks + // S3-compatible stores that only accept virtual-hosted-style URLs. + // Any explicit `s3.path-style-access` property below overrides this. 
+    cfg.enable_virtual_host_style = true;
     if let Some(endpoint) = m.remove(S3_ENDPOINT) {
         cfg.endpoint = Some(endpoint);
     };
@@ -177,3 +183,28 @@ impl AwsCredentialLoad for CustomAwsCredentialLoader {
         self.0.load_credential(client).await
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use std::collections::HashMap;
+
+    use iceberg::io::S3_PATH_STYLE_ACCESS;
+
+    use super::s3_config_parse;
+
+    fn parse_with(prop: Option<&str>) -> bool {
+        let mut props = HashMap::new();
+        if let Some(v) = prop {
+            props.insert(S3_PATH_STYLE_ACCESS.to_string(), v.to_string());
+        }
+        s3_config_parse(props).unwrap().enable_virtual_host_style
+    }
+
+    #[test]
+    fn s3_config_parse_path_style_access() {
+        // Match Iceberg S3FileIOProperties.PATH_STYLE_ACCESS_DEFAULT = false.
+        assert!(parse_with(None));
+        assert!(parse_with(Some("false")));
+        assert!(!parse_with(Some("true")));
+    }
+}
diff --git a/crates/storage/opendal/tests/file_io_s3_test.rs b/crates/storage/opendal/tests/file_io_s3_test.rs
index 207a4454d7..d6dd8a3b45 100644
--- a/crates/storage/opendal/tests/file_io_s3_test.rs
+++ b/crates/storage/opendal/tests/file_io_s3_test.rs
@@ -26,7 +26,8 @@ mod tests {
     use async_trait::async_trait;
     use futures::StreamExt;
     use iceberg::io::{
-        FileIO, FileIOBuilder, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_REGION, S3_SECRET_ACCESS_KEY,
+        FileIO, FileIOBuilder, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_PATH_STYLE_ACCESS, S3_REGION,
+        S3_SECRET_ACCESS_KEY,
     };
     use iceberg_storage_opendal::{CustomAwsCredentialLoader, OpenDalStorageFactory};
     use iceberg_test_utils::{get_minio_endpoint, normalize_test_name_with_parts, set_up};
@@ -39,7 +40,6 @@ mod tests {
         let minio_endpoint = get_minio_endpoint();
 
         FileIOBuilder::new(Arc::new(OpenDalStorageFactory::S3 {
-            configured_scheme: "s3".to_string(),
             customized_credential_load: None,
         }))
         .with_props(vec![
             (S3_ENDPOINT, minio_endpoint),
             (S3_ACCESS_KEY_ID, "admin".to_string()),
             (S3_SECRET_ACCESS_KEY, "password".to_string()),
             (S3_REGION, "us-east-1".to_string()),
+            (S3_PATH_STYLE_ACCESS, "true".to_string()),
         ])
         .build()
     }
@@ -132,13 +133,13 @@ mod tests {
         // Test that the loader can be used in FileIOBuilder with OpenDalStorageFactory
         let _builder = FileIOBuilder::new(Arc::new(OpenDalStorageFactory::S3 {
-            configured_scheme: "s3".to_string(),
             customized_credential_load: Some(custom_loader),
         }))
         .with_props(vec![
             (S3_ENDPOINT, "http://localhost:9000".to_string()),
             ("bucket", "test-bucket".to_string()),
             (S3_REGION, "us-east-1".to_string()),
+            (S3_PATH_STYLE_ACCESS, "true".to_string()),
         ]);
     }
@@ -154,12 +155,12 @@ mod tests {
         // Build FileIO with custom credential loader via OpenDalStorageFactory
         let file_io_with_custom_creds =
             FileIOBuilder::new(Arc::new(OpenDalStorageFactory::S3 {
-                configured_scheme: "s3".to_string(),
                 customized_credential_load: Some(custom_loader),
             }))
             .with_props(vec![
                 (S3_ENDPOINT, minio_endpoint),
                 (S3_REGION, "us-east-1".to_string()),
+                (S3_PATH_STYLE_ACCESS, "true".to_string()),
             ])
             .build();
@@ -182,12 +183,12 @@ mod tests {
         // Build FileIO with custom credential loader via OpenDalStorageFactory
         let file_io_with_custom_creds =
             FileIOBuilder::new(Arc::new(OpenDalStorageFactory::S3 {
-                configured_scheme: "s3".to_string(),
                 customized_credential_load: Some(custom_loader),
             }))
            .with_props(vec![
                 (S3_ENDPOINT, minio_endpoint),
                 (S3_REGION, "us-east-1".to_string()),
+                (S3_PATH_STYLE_ACCESS, "true".to_string()),
             ])
             .build();
diff --git a/crates/storage/opendal/tests/resolving_storage_test.rs b/crates/storage/opendal/tests/resolving_storage_test.rs
index 4572ad2c2d..c235089508 100644
--- a/crates/storage/opendal/tests/resolving_storage_test.rs
+++ b/crates/storage/opendal/tests/resolving_storage_test.rs
@@ -29,7 +29,8 @@ mod tests {
     use std::sync::Arc;
 
     use iceberg::io::{
-        FileIOBuilder, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_REGION, S3_SECRET_ACCESS_KEY,
+        FileIOBuilder, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_PATH_STYLE_ACCESS, S3_REGION,
+        S3_SECRET_ACCESS_KEY,
     };
     use iceberg_storage_opendal::OpenDalResolvingStorageFactory;
     use iceberg_test_utils::{get_minio_endpoint, normalize_test_name_with_parts, set_up};
@@ -45,6 +46,7 @@ mod tests {
             (S3_ACCESS_KEY_ID, "admin".to_string()),
             (S3_SECRET_ACCESS_KEY, "password".to_string()),
             (S3_REGION, "us-east-1".to_string()),
+            (S3_PATH_STYLE_ACCESS, "true".to_string()),
         ])
         .build()
     }
@@ -288,6 +290,7 @@ mod tests {
         .with_props(vec![
             (S3_ENDPOINT, minio_endpoint),
             (S3_REGION, "us-east-1".to_string()),
+            (S3_PATH_STYLE_ACCESS, "true".to_string()),
        ])
        .build();
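
A note on the resolving.rs change above: the whole alias story reduces to one pattern. Normalize the URL scheme into a small enum, then key the lazy cache on that enum so `s3://`, `s3a://`, and `s3n://` all land in the same slot. The sketch below is a minimal, self-contained illustration of that pattern; `Scheme`, `Storage`, and `Resolver` are hypothetical stand-ins, not the crate's actual types, and real code parses the URL properly instead of splitting on `://`.

```rust
use std::collections::HashMap;
use std::sync::{Arc, RwLock};

// Stand-in for the crate's scheme-family enum.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
enum Scheme {
    S3,
    Azdls,
}

fn parse_scheme(scheme: &str) -> Result<Scheme, String> {
    match scheme {
        // All S3-family aliases collapse to one variant, so they share
        // a single cache slot below.
        "s3" | "s3a" | "s3n" => Ok(Scheme::S3),
        "abfss" | "abfs" => Ok(Scheme::Azdls),
        other => Err(format!("unsupported scheme: {other}")),
    }
}

struct Storage; // placeholder for a real backend

struct Resolver {
    storages: RwLock<HashMap<Scheme, Arc<Storage>>>,
}

impl Resolver {
    fn resolve(&self, path: &str) -> Result<Arc<Storage>, String> {
        let scheme = parse_scheme(path.split("://").next().unwrap_or_default())?;

        // Fast path: reuse an already-built storage for this scheme family.
        if let Some(storage) = self.storages.read().unwrap().get(&scheme) {
            return Ok(storage.clone());
        }

        // Slow path: build once, then cache. `entry().or_insert_with` keeps
        // a concurrent racer from overwriting an existing entry.
        let mut guard = self.storages.write().unwrap();
        Ok(guard.entry(scheme).or_insert_with(|| Arc::new(Storage)).clone())
    }
}

fn main() {
    let resolver = Resolver { storages: RwLock::new(HashMap::new()) };
    let a = resolver.resolve("s3://bucket/key").unwrap();
    let b = resolver.resolve("s3a://bucket/key").unwrap();
    assert!(Arc::ptr_eq(&a, &b)); // aliases share one instance
    println!("s3 and s3a resolved to the same cached storage");
}
```

Taking the write lock only on a cache miss keeps the hot path on a shared read lock, which is the same trade-off the `storages: RwLock<HashMap<Scheme, Arc<OpenDalStorage>>>` field makes.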
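The `delete_stream` change applies the same normalization to a whole stream: drain it once, bucket each path under its scheme, and hand every backend a single batch instead of resolving per path. A rough sketch of just the grouping step, using the `futures` crate; `extract_scheme` here is a simplified stand-in that skips URL validation.

```rust
use std::collections::HashMap;

use futures::executor::block_on;
use futures::stream::{self, BoxStream, StreamExt};

// Simplified scheme extractor; the real code parses a full URL and maps
// aliases onto a scheme-family enum.
fn extract_scheme(path: &str) -> String {
    path.split("://").next().unwrap_or_default().to_string()
}

// One pass over the stream buckets every path under its scheme, so each
// backend can be resolved once and handed its whole batch.
async fn group_by_scheme(mut paths: BoxStream<'static, String>) -> HashMap<String, Vec<String>> {
    let mut grouped: HashMap<String, Vec<String>> = HashMap::new();
    while let Some(path) = paths.next().await {
        grouped.entry(extract_scheme(&path)).or_default().push(path);
    }
    grouped
}

fn main() {
    let paths = stream::iter(vec![
        "s3://bucket/a".to_string(),
        "s3://bucket/b".to_string(),
        "gs://bucket/c".to_string(),
    ])
    .boxed();

    let grouped = block_on(group_by_scheme(paths));
    assert_eq!(grouped["s3"].len(), 2);
    assert_eq!(grouped["gs"].len(), 1);
}
```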
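Finally, the s3.rs default boils down to a single rule: virtual-host-style addressing stays on unless the caller sets `s3.path-style-access` to a truthy value. Below is a sketch of that rule under the assumption that `is_truthy` accepts the usual boolean spellings; the real helper lives in `crates/storage/opendal/src/utils.rs` and may accept a different set.

```rust
use std::collections::HashMap;

// Assumed behavior of the crate's `is_truthy` helper.
fn is_truthy(v: &str) -> bool {
    matches!(v.to_ascii_lowercase().as_str(), "true" | "t" | "1" | "on")
}

/// Virtual-host-style is on unless path-style access is explicitly requested,
/// mirroring Java's `S3FileIOProperties.PATH_STYLE_ACCESS_DEFAULT = false`.
fn virtual_host_style(props: &HashMap<String, String>) -> bool {
    match props.get("s3.path-style-access") {
        Some(v) => !is_truthy(v), // explicit property wins
        None => true,             // spec default: path-style off
    }
}

fn main() {
    // The two addressing modes produce differently shaped request URLs:
    //   virtual-host-style: https://my-bucket.s3.amazonaws.com/key
    //   path-style:         https://s3.amazonaws.com/my-bucket/key
    let mut props = HashMap::new();
    assert!(virtual_host_style(&props)); // default

    props.insert("s3.path-style-access".to_string(), "true".to_string());
    assert!(!virtual_host_style(&props)); // MinIO-style endpoints often need this
}
```

This is also why the MinIO-backed integration tests in this patch now pass `S3_PATH_STYLE_ACCESS = "true"` explicitly: with the new default, a local endpoint would otherwise be addressed as a virtual host.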