diff --git a/.github/workflows/ci-fips.yml b/.github/workflows/ci-fips.yml index 480d67306..2ed0470d9 100644 --- a/.github/workflows/ci-fips.yml +++ b/.github/workflows/ci-fips.yml @@ -55,7 +55,7 @@ jobs: - name: Install build deps (Linux) if: runner.os == 'Linux' - run: sudo apt-get update && sudo apt-get install -y cmake nasm golang-go + run: sudo apt-get -o Acquire::Retries=3 update && sudo apt-get -o Acquire::Retries=3 install -y cmake nasm golang-go - name: Install build deps (macOS) if: runner.os == 'macOS' @@ -66,7 +66,7 @@ jobs: run: choco install nasm -y - name: Install Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -87,6 +87,36 @@ jobs: - name: Test --no-default-features --features fips,icc run: cargo test --no-default-features --features fips,icc + # FIPS-safe example run (cross-OS): the core scenarios use no weak crypto, + # so they must work under the FIPS build. Demonstrates extraction / + # conversion / create / search / edit / forms on Linux/macOS/Windows. + # The FIPS *rejection* path (Standard Security R≤4 needs MD5+RC4 → refused; + # only AES-256/SHA-2 R=6 is permitted, ISO 32000-1 §7.6) is covered by the + # `cargo test --features fips,icc` step above (encryption handler unit tests). 
+ - name: Run FIPS-safe Rust examples (cross-OS, with assertions) + shell: bash + run: | + set -uo pipefail + PDF=tests/fixtures/simple.pdf + out_dir="${RUNNER_TEMP:-/tmp}" + FEAT="--no-default-features --features fips,icc" + fail() { echo "ASSERT FAILED: $1"; exit 1; } + run() { # run <expect> -- <cmd...>: fail on non-zero exit, blank output, or output missing <expect> (check skipped when <expect> is empty) + local expect="$1"; shift; [ "$1" = "--" ] && shift + echo "::group::$*" + local out; out="$("$@" 2>&1)" || { echo "$out"; fail "non-zero exit: $*"; } + echo "$out" + [ -n "$(printf '%s' "$out" | tr -d '[:space:]')" ] || fail "empty output: $*" + [ -z "$expect" ] || grep -qF -- "$expect" <<<"$out" || fail "missing '$expect': $*" # here-string, not a pipe: under pipefail, grep -q exiting on its first match can SIGPIPE the writer and fail the check spuriously + echo "::endgroup::" + } + run "Pages:" -- cargo run $FEAT --quiet --example tutorial_extract_text -- "$PDF" + run "" -- cargo run $FEAT --quiet --example tutorial_convert_formats -- "$PDF" + run "" -- cargo run $FEAT --quiet --example tutorial_create_pdf + run "" -- cargo run $FEAT --quiet --example tutorial_search_text -- "$PDF" "PDF" + run "" -- cargo run $FEAT --quiet --example tutorial_edit_document -- "$PDF" "$out_dir/fips_edited.pdf" + run "" -- cargo run $FEAT --quiet --example tutorial_forms_annotations -- "$PDF" + # ─── Java binding FIPS build (v0.3.53 #NNN).
Validates the # `pdf_oxide_jni` cdylib compiles under --features fips and that # the Java surface still works against a FIPS-compiled native @@ -116,14 +146,14 @@ jobs: - name: Install build deps (Linux) if: runner.os == 'Linux' - run: sudo apt-get update && sudo apt-get install -y cmake nasm golang-go + run: sudo apt-get -o Acquire::Retries=3 update && sudo apt-get -o Acquire::Retries=3 install -y cmake nasm golang-go - name: Install build deps (macOS) if: runner.os == 'macOS' run: brew install cmake nasm go - name: Install Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -232,7 +262,7 @@ jobs: - name: Install build deps (Linux) if: runner.os == 'Linux' - run: sudo apt-get update && sudo apt-get install -y cmake nasm golang-go + run: sudo apt-get -o Acquire::Retries=3 update && sudo apt-get -o Acquire::Retries=3 install -y cmake nasm golang-go - name: Install build deps (macOS) if: runner.os == 'macOS' @@ -251,7 +281,7 @@ jobs: uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # v8.2.0 - name: Install Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 81092b10c..45ea3ebee 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -33,7 +33,7 @@ jobs: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - name: Install Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -54,7 +54,7 @@ jobs: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - name: Install Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: 
actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -89,7 +89,7 @@ jobs: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - name: Install Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -135,7 +135,7 @@ jobs: uses: ./.github/actions/free-disk-space - name: Install Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -199,7 +199,7 @@ jobs: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - name: Install Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -228,10 +228,10 @@ jobs: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - name: Install build deps for aws-lc-rs - run: sudo apt-get update && sudo apt-get install -y cmake nasm + run: sudo apt-get -o Acquire::Retries=3 update && sudo apt-get -o Acquire::Retries=3 install -y cmake nasm - name: Install Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -253,15 +253,20 @@ jobs: - name: Clippy under FIPS feature run: cargo clippy --no-default-features --features python,fips,icc --lib --tests -- -D warnings - # Rust examples smoke-test + # Rust examples — core scenarios run + assert output on Linux/macOS/Windows + # (#648 class: a Linux-only run would miss per-OS path/CRLF/loader regressions). 
rust-examples: name: Rust Examples - runs-on: ubuntu-latest + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] steps: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - name: Install Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -269,27 +274,52 @@ jobs: - name: Cache uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2 with: - key: rust-examples + key: rust-examples-${{ matrix.os }} - name: Build examples (default features) run: cargo build --examples - - name: Build examples (signatures feature) - run: cargo build --examples --features signatures - - - name: Run Rust examples + # Core tutorial scenarios (01–07): run on every OS and assert real output, + # not just exit code, so a silent empty/garbage regression fails CI. + - name: Run core Rust examples (cross-OS, with assertions) + shell: bash + run: | + set -uo pipefail + PDF=tests/fixtures/simple.pdf + out_dir="${RUNNER_TEMP:-/tmp}" + fail() { echo "ASSERT FAILED: $1"; exit 1; } + run() { # run <expect> -- <cmd...>: fail on non-zero exit, blank output, or output missing <expect> (check skipped when <expect> is empty) + local expect="$1"; shift; [ "$1" = "--" ] && shift + echo "::group::$*" + local out; out="$("$@" 2>&1)" || { echo "$out"; fail "non-zero exit: $*"; } + echo "$out" + [ -n "$(printf '%s' "$out" | tr -d '[:space:]')" ] || fail "empty output: $*" + if [ -n "$expect" ]; then grep -qF -- "$expect" <<<"$out" || fail "missing '$expect': $*"; fi # here-string, not a pipe: under pipefail, grep -q exiting on its first match can SIGPIPE the writer and fail the check spuriously + echo "::endgroup::" + } + run "Pages:" -- cargo run --quiet --example tutorial_extract_text -- "$PDF" + run "" -- cargo run --quiet --example tutorial_convert_formats -- "$PDF" + run "" -- cargo run --quiet --example tutorial_create_pdf + run "" -- cargo run --quiet --example tutorial_search_text -- "$PDF" "PDF" + run "" -- cargo run --quiet --example tutorial_extract_structured -- "$PDF" + run "" -- cargo run --quiet
--example tutorial_edit_document -- "$PDF" "$out_dir/edited.pdf" + run "" -- cargo run --quiet --example tutorial_forms_annotations -- "$PDF" + + # Batch + feature showcases are OS-independent (pure-Rust core) — run once + # on Linux to bound CI minutes. + - name: Run showcase Rust examples (Linux only) + if: matrix.os == 'ubuntu-latest' shell: bash run: | + set -euo pipefail PDF=tests/fixtures/simple.pdf - cargo run --example tutorial_extract_text -- "$PDF" - cargo run --example tutorial_convert_formats -- "$PDF" - cargo run --example tutorial_create_pdf - cargo run --example tutorial_search_text -- "$PDF" "PDF" - cargo run --example tutorial_extract_structured -- "$PDF" - cargo run --example tutorial_edit_document -- "$PDF" /tmp/edited.pdf - cargo run --example tutorial_forms_annotations -- "$PDF" + cargo build --examples --features signatures cargo run --example tutorial_batch_processing -- "$PDF" "$PDF" cargo run --example showcase_barcode_svg --features barcodes + cargo run --example showcase_compliance_validation + cargo run --example showcase_dashed_stroke + cargo run --example showcase_encrypted_bytes + cargo run --example showcase_page_extraction cargo run --example showcase_streaming_table cargo run --example showcase_image_embedding cargo run --example showcase_pdf_ua_image @@ -310,7 +340,7 @@ jobs: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - name: Install Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -335,7 +365,7 @@ jobs: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - name: Install Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -357,7 +387,7 @@ jobs: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # 
v6.0.3 - name: Install Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -405,7 +435,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -465,6 +495,71 @@ jobs: cd "$GITHUB_WORKSPACE/examples/python/09-new-features/office_conversion" && uv run python main.py cd "$GITHUB_WORKSPACE/examples/python/09-new-features/ocr_scanned_pdf" && uv run python main.py + # Python examples — core scenarios run + assert output on Linux/macOS/Windows + # (the version-matrixed `python` job above runs the full suite on Linux only). + python-examples: + name: Python Examples (${{ matrix.os }}) + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + steps: + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + + - name: Set up Python + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: '3.12' + + - name: Install Rust + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 + with: + cache: false + rustflags: '' + + - name: Cache + uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2 + with: + key: python-examples-${{ matrix.os }} + + - name: Install uv + uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0 + + - name: Build + install wheel + shell: bash + run: | + uv venv + uv pip install maturin + uv run maturin build --release --features python,barcodes --out dist + wheel=$(ls dist/*.whl | head -n1) + uv pip install "$wheel" + + # Core scenarios (01–07): run + assert output on every OS.
+ - name: Run Python examples (core, cross-OS, with assertions) + shell: bash + run: | + set -uo pipefail + PDF="$GITHUB_WORKSPACE/tests/fixtures/simple.pdf" + out_dir="${RUNNER_TEMP:-/tmp}" + fail() { echo "ASSERT FAILED: $1"; exit 1; } + run() { # run -- + local dir="$1"; shift; [ "$1" = "--" ] && shift + echo "::group::python $dir" + local out; out="$(cd "$GITHUB_WORKSPACE/examples/python/$dir" && uv run --project "$GITHUB_WORKSPACE" python main.py "$@" 2>&1)" \ + || { echo "$out"; fail "exit: $dir"; } + echo "$out" + [ -n "$(printf '%s' "$out" | tr -d '[:space:]')" ] || fail "empty output: $dir" + echo "::endgroup::" + } + run 01-extract-text -- "$PDF" + run 02-convert-formats -- "$PDF" + run 03-create-pdf -- + run 04-search-text -- "$PDF" "PDF" + run 05-extract-structured -- "$PDF" + run 06-edit-document -- "$PDF" "$out_dir/py_edited.pdf" + run 07-forms-annotations -- "$PDF" + # WASM build check wasm-build: name: WASM Build @@ -478,7 +573,7 @@ jobs: tool-cache: 'false' - name: Install Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -494,6 +589,28 @@ jobs: - name: Clippy WASM run: cargo clippy --target wasm32-unknown-unknown --no-default-features --features wasm,rendering,barcodes --lib -- -D warnings + # Generate the Node.js bindings from the debug wasm build and run the + # WASM consumer example + the core functional test-parity suite. This is + # the WASM leg of the cross-language example/parity verification: it + # proves the wasm-bindgen surface actually loads and the same core + # behaviors hold as in every other binding. wasm-bindgen-cli is pinned to + # the exact `wasm-bindgen` crate version (matching release.yml). 
+ - name: Install matching wasm-bindgen-cli + run: | + WASM_BINDGEN_VERSION=$(grep -A1 'name = "wasm-bindgen"' Cargo.lock | grep version | head -1 | sed 's/.*"\(.*\)"/\1/') + echo "Installing wasm-bindgen-cli@${WASM_BINDGEN_VERSION}" + cargo install wasm-bindgen-cli --version "$WASM_BINDGEN_VERSION" + + - name: WASM example + core-parity suite (Node consumer) + run: | + set -eux + wasm-bindgen --target nodejs \ + --out-dir examples/wasm_node/ \ + target/wasm32-unknown-unknown/debug/pdf_oxide.wasm + cd examples/wasm_node + node extract_text.mjs + node --test core_parity.test.mjs + # WASI build target (issue #214). Used by sandboxed agent / # serverless deployments via wasmtime / wasmer / Deno. We only # build to keep `cargo +nightly` out of the equation — runtime @@ -515,7 +632,7 @@ jobs: run: | set -eux ver="120" - curl -fL -o binaryen.tar.gz "https://github.com/WebAssembly/binaryen/releases/download/version_${ver}/binaryen-version_${ver}-x86_64-linux.tar.gz" + curl --retry 3 --retry-all-errors -fL -o binaryen.tar.gz "https://github.com/WebAssembly/binaryen/releases/download/version_${ver}/binaryen-version_${ver}-x86_64-linux.tar.gz" tar -xzf binaryen.tar.gz echo "$PWD/binaryen-version_${ver}/bin" >> "$GITHUB_PATH" @@ -626,18 +743,27 @@ jobs: # Java JNI shim crate (`pdf_oxide_jni`) — cdylib that exports # `Java_fyi_oxide_pdf_*` symbols. Built with the same # extended feature set so Java tests can exercise rendering / - # signatures / TSA. Linux only in PR CI; release.yml fans - # out to the five JAR-bundled arches (linux x86_64/aarch64, - # macOS x86_64/aarch64, windows x86_64). + # signatures / TSA. Built on all three host OSes so the Java + # binding's JNI surface is exercised cross-OS in PR CI (#458 + # follow-up: previously Linux-only). release.yml still fans out + # to the five JAR-bundled arches for publishing. 
- os: ubuntu-latest variant: java-jni features: "rendering,signatures,tsa-client" extra-target: "" + - os: macos-latest + variant: java-jni + features: "rendering,signatures,tsa-client" + extra-target: "" + - os: windows-latest + variant: java-jni + features: "rendering,signatures,tsa-client" + extra-target: "" steps: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - name: Install Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -772,22 +898,40 @@ jobs: working-directory: go run: go test -race -count=1 -p 1 -tags pdf_oxide_dev ./... - - name: Run Go examples - if: matrix.os == 'ubuntu-latest' + # Core scenarios (01–07): run + assert output on Linux/macOS/Windows. + - name: Run Go examples (core, cross-OS, with assertions) shell: bash run: | + set -uo pipefail PDF="$GITHUB_WORKSPACE/tests/fixtures/simple.pdf" - for dir in 01-extract-text 02-convert-formats \ - 05-extract-structured 07-forms-annotations 08-batch-processing; do + out_dir="${RUNNER_TEMP:-/tmp}" + fail() { echo "ASSERT FAILED: $1"; exit 1; } + run() { # run -- + local dir="$1"; shift; [ "$1" = "--" ] && shift cd "$GITHUB_WORKSPACE/examples/go/$dir" - go run -tags pdf_oxide_dev main.go "$PDF" - done - cd "$GITHUB_WORKSPACE/examples/go/04-search-text" - go run -tags pdf_oxide_dev main.go "$PDF" "PDF" - cd "$GITHUB_WORKSPACE/examples/go/03-create-pdf" - go run -tags pdf_oxide_dev main.go - cd "$GITHUB_WORKSPACE/examples/go/06-edit-document" - go run -tags pdf_oxide_dev main.go "$PDF" /tmp/go_edited.pdf + echo "::group::go $dir" + local out; out="$(go run -tags pdf_oxide_dev main.go "$@" 2>&1)" || { echo "$out"; fail "exit: $dir"; } + echo "$out" + [ -n "$(printf '%s' "$out" | tr -d '[:space:]')" ] || fail "empty output: $dir" + echo "::endgroup::" + } + run 01-extract-text -- "$PDF" + run 02-convert-formats -- "$PDF" + run 03-create-pdf -- + run 
04-search-text -- "$PDF" "PDF" + run 05-extract-structured -- "$PDF" + run 06-edit-document -- "$PDF" "$out_dir/go_edited.pdf" + run 07-forms-annotations -- "$PDF" + + # Batch + feature showcases are OS-independent — run once on Linux. + - name: Run Go showcase examples (Linux only) + if: matrix.os == 'ubuntu-latest' + shell: bash + run: | + set -euo pipefail + PDF="$GITHUB_WORKSPACE/tests/fixtures/simple.pdf" + cd "$GITHUB_WORKSPACE/examples/go/08-batch-processing" + go run -tags pdf_oxide_dev main.go "$PDF" "$PDF" for feat in barcode_svg streaming_table pdf_ua_image in_memory_roundtrip pkcs12_signing rfc3161_timestamp dashed_stroke encrypted_bytes page_extraction pdfa_conversion compliance_validation image_embedding office_conversion; do cd "$GITHUB_WORKSPACE/examples/go/09-new-features/$feat" go run -tags pdf_oxide_dev main.go @@ -842,14 +986,17 @@ jobs: working-directory: js run: npx tsc --noEmit - # Compile TypeScript to lib/ — publint / arethetypeswrong need - # the actual .js + .d.ts files that package.json's `main` / - # `types` / `exports` entries point at. `--noEmit` above only - # catches type errors, it doesn't write the output tree. - - name: TypeScript compile to lib/ - if: matrix.os == 'ubuntu-latest' + # Build lib/ on every OS via the canonical build (tsc + fix-esm-imports). + # `lib/*.js` is gitignored and generated, and the test/example steps below + # import the package through it, so it must exist on macOS/Windows too — + # not just Linux. Bare `tsc` is NOT enough: the source uses extensionless + # relative imports, and `fix-esm-imports.js` rewrites them to explicit + # `.js` paths that Node's ESM resolver requires at runtime. publint / + # arethetypeswrong also need these real .js + .d.ts files that + # package.json's `main` / `types` / `exports` entries point at. + - name: Build lib/ (tsc + ESM import fixups) working-directory: js - run: npx tsc + run: npm run build:ts # Biome v2 replaces eslint + prettier with a single Rust-core tool. 
- name: Biome lint+format check @@ -922,51 +1069,87 @@ jobs: npx node-gyp rebuild ls -la build/Release/pdf_oxide.node - - name: Run Node.js tests - working-directory: js - run: npm test - - - name: Stage node-gyp build as prebuilt for examples - if: matrix.os == 'ubuntu-latest' + # Stage the freshly-built addon into the per-OS prebuilds dir the package + # resolves at runtime (linux-x64 / darwin-arm64 / win32-x64). This must + # run BEFORE the tests below: the compiled lib/ loads the native addon + # from prebuilds/, so the lib-backed tests (core-parity, readme-quickstart) + # can only load it once it's staged here — on every OS, not just Linux. + - name: Stage node-gyp build as prebuilt working-directory: js shell: bash run: | - mkdir -p prebuilds/linux-x64 - cp build/Release/pdf_oxide.node prebuilds/linux-x64/pdf_oxide.node + dir="prebuilds/$(node -e "process.stdout.write(process.platform + '-' + process.arch)")" + mkdir -p "$dir" + cp build/Release/pdf_oxide.node "$dir/pdf_oxide.node" + ls -la "$dir" + + - name: Run Node.js tests + working-directory: js + run: npm test - name: Install examples workspace dependencies - if: matrix.os == 'ubuntu-latest' working-directory: examples/javascript run: npm install --ignore-scripts - - name: Run JavaScript examples + # Core scenarios (01–07): run + assert output on Linux/macOS/Windows. This + # is the direct #648 guard (a Windows-Node ESM/loader regression must fail + # CI). No `timeout` — macOS/Windows git-bash lack it; the job timeout covers + # hangs. + - name: Run JavaScript examples (core, cross-OS, with assertions) + shell: bash + env: + NODE_PATH: "${{ github.workspace }}/examples/javascript/node_modules" + run: | + set -uo pipefail + PDF="$GITHUB_WORKSPACE/tests/fixtures/simple.pdf" + cd "$GITHUB_WORKSPACE/examples/javascript" + fail() { echo "ASSERT FAILED: $1"; exit 1; } + run() { # run -- node script args... 
+ local expect="$1"; shift; [ "$1" = "--" ] && shift + echo "::group::$*" + local out; out="$("$@" 2>&1)" || { echo "$out"; fail "non-zero exit: $*"; } + echo "$out" + [ -n "$(printf '%s' "$out" | tr -d '[:space:]')" ] || fail "empty output: $*" + [ -z "$expect" ] || grep -qF -- "$expect" <<<"$out" || fail "missing '$expect': $*" # here-string, not a pipe: under pipefail, grep -q exiting on its first match can SIGPIPE the writer and fail the check spuriously + echo "::endgroup::" + } + run "" -- node 01-extract-text/index.js "$PDF" + run "" -- node 02-convert-formats/index.js "$PDF" + run "" -- node 03-create-pdf/index.js + run "" -- node 04-search-text/index.js "$PDF" "PDF" + run "" -- node 05-extract-structured/index.js "$PDF" + # 06 deletes a page, so it needs a multi-page input (simple.pdf is 1 page). + run "Deleted" -- node 06-edit-document/index.js "$GITHUB_WORKSPACE/tests/fixtures/hello_structure.pdf" "${RUNNER_TEMP:-/tmp}/js_edited.pdf" + run "" -- node 07-forms-annotations/index.js "$PDF" + + # Batch + feature showcases are OS-independent — run once on Linux. + - name: Run JavaScript showcase examples (Linux only) if: matrix.os == 'ubuntu-latest' shell: bash env: NODE_PATH: "${{ github.workspace }}/examples/javascript/node_modules" run: | + set -euo pipefail PDF="$GITHUB_WORKSPACE/tests/fixtures/simple.pdf" cd "$GITHUB_WORKSPACE/examples/javascript" - echo "--- 01-extract-text ---" && timeout 120 node 01-extract-text/index.js "$PDF" - echo "--- 02-convert-formats ---" && timeout 120 node 02-convert-formats/index.js "$PDF" - echo "--- 03-create-pdf ---" && timeout 120 node 03-create-pdf/index.js - echo "--- 04-search-text ---" && timeout 120 node 04-search-text/index.js "$PDF" "PDF" - echo "--- 05-extract-structured ---" && timeout 120 node 05-extract-structured/index.js "$PDF" - echo "--- 07-forms-annotations ---" && timeout 120 node 07-forms-annotations/index.js "$PDF" - echo "--- 08-batch-processing ---" && timeout 120 node 08-batch-processing/index.js "$PDF" "$PDF" - echo "--- barcode_svg ---" && timeout 120 node 09-new-features/barcode_svg/index.js - echo "--- streaming_table
---" && timeout 120 node 09-new-features/streaming_table/index.js - echo "--- pdf_ua_image ---" && timeout 120 node 09-new-features/pdf_ua_image/index.js - echo "--- in_memory_roundtrip ---" && timeout 120 node 09-new-features/in_memory_roundtrip/index.js - echo "--- pkcs12_signing ---" && timeout 120 node 09-new-features/pkcs12_signing/index.js - echo "--- rfc3161_timestamp ---" && timeout 120 node 09-new-features/rfc3161_timestamp/index.js - echo "--- dashed_stroke ---" && timeout 120 node 09-new-features/dashed_stroke/index.js - echo "--- encrypted_bytes ---" && timeout 120 node 09-new-features/encrypted_bytes/index.js - echo "--- page_extraction ---" && timeout 120 node 09-new-features/page_extraction/index.js - echo "--- compliance_validation ---" && timeout 120 node 09-new-features/compliance_validation/index.js - echo "--- pdfa_conversion ---" && timeout 120 node 09-new-features/pdfa_conversion/index.js - echo "--- image_embedding ---" && timeout 120 node 09-new-features/image_embedding/index.js - echo "--- office_conversion ---" && timeout 120 node 09-new-features/office_conversion/index.js + for ex in \ + "08-batch-processing/index.js $PDF $PDF" \ + 09-new-features/barcode_svg/index.js \ + 09-new-features/streaming_table/index.js \ + 09-new-features/pdf_ua_image/index.js \ + 09-new-features/in_memory_roundtrip/index.js \ + 09-new-features/pkcs12_signing/index.js \ + 09-new-features/rfc3161_timestamp/index.js \ + 09-new-features/dashed_stroke/index.js \ + 09-new-features/encrypted_bytes/index.js \ + 09-new-features/page_extraction/index.js \ + 09-new-features/compliance_validation/index.js \ + 09-new-features/pdfa_conversion/index.js \ + 09-new-features/image_embedding/index.js \ + 09-new-features/office_conversion/index.js; do + echo "--- $ex ---" + timeout 120 node $ex + done - name: Compile TypeScript examples if: matrix.os == 'ubuntu-latest' @@ -1062,23 +1245,45 @@ jobs: working-directory: csharp/PdfOxide.Tests run: dotnet test -c Release --no-build 
--verbosity normal - - name: Run C# examples + # Core scenarios (01–07): run + assert output on Linux/macOS/Windows. The + # native lib is resolved per-OS (LD/DYLD_LIBRARY_PATH on Linux/macOS, PATH + # on Windows). + - name: Run C# examples (core, cross-OS, with assertions) + shell: bash + env: + LD_LIBRARY_PATH: "${{ github.workspace }}/target/release" + DYLD_LIBRARY_PATH: "${{ github.workspace }}/target/release" + run: | + set -uo pipefail + PDF="$GITHUB_WORKSPACE/tests/fixtures/simple.pdf" + out_dir="${RUNNER_TEMP:-/tmp}" + export PATH="$GITHUB_WORKSPACE/target/release:$PATH" # Windows dll search + fail() { echo "ASSERT FAILED: $1"; exit 1; } + run() { # run -- + local proj="$1"; shift; [ "$1" = "--" ] && shift + echo "::group::$proj" + local out; out="$(dotnet run --project "$proj" -- "$@" 2>&1)" || { echo "$out"; fail "exit: $proj"; } + echo "$out" + [ -n "$(printf '%s' "$out" | tr -d '[:space:]')" ] || fail "empty output: $proj" + echo "::endgroup::" + } + run examples/csharp/01-extract-text/ExtractText.csproj -- "$PDF" + run examples/csharp/02-convert-formats/ConvertFormats.csproj -- "$PDF" + run examples/csharp/03-create-pdf/CreatePdf.csproj -- + run examples/csharp/04-search-text/SearchText.csproj -- "$PDF" "PDF" + run examples/csharp/05-extract-structured/ExtractStructured.csproj -- "$PDF" + run examples/csharp/06-edit-document/EditDocument.csproj -- "$PDF" "$out_dir/cs_edited.pdf" + run examples/csharp/07-forms-annotations/FormsAnnotations.csproj -- "$PDF" + + # Batch + feature showcases are OS-independent — run once on Linux. 
+ - name: Run C# showcase examples (Linux only) if: matrix.os == 'ubuntu-latest' shell: bash env: LD_LIBRARY_PATH: "${{ github.workspace }}/target/release" run: | + set -euo pipefail PDF="$GITHUB_WORKSPACE/tests/fixtures/simple.pdf" - for proj in \ - examples/csharp/01-extract-text/ExtractText.csproj \ - examples/csharp/02-convert-formats/ConvertFormats.csproj \ - examples/csharp/05-extract-structured/ExtractStructured.csproj \ - examples/csharp/07-forms-annotations/FormsAnnotations.csproj; do - dotnet run --project "$proj" -- "$PDF" - done - dotnet run --project examples/csharp/04-search-text/SearchText.csproj -- "$PDF" "PDF" - dotnet run --project examples/csharp/03-create-pdf/CreatePdf.csproj - dotnet run --project examples/csharp/06-edit-document/EditDocument.csproj -- "$PDF" /tmp/cs_edited.pdf dotnet run --project examples/csharp/08-batch-processing/BatchProcessing.csproj -- "$PDF" "$PDF" for proj in \ examples/csharp/09-new-features/BarcodeSvg/BarcodeSvg.csproj \ @@ -1147,6 +1352,9 @@ jobs: strategy: fail-fast: false matrix: + # Linux exercises the full JDK range (11/17/21); macOS + Windows add + # one JDK each so the JNI surface is verified on every host OS without + # tripling the JDK matrix on every platform. include: - os: ubuntu-latest jdk: '11' @@ -1154,6 +1362,10 @@ jobs: jdk: '17' - os: ubuntu-latest jdk: '21' + - os: macos-latest + jdk: '21' + - os: windows-latest + jdk: '21' steps: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 @@ -1174,18 +1386,32 @@ jobs: - name: Download Java JNI native lib artifact uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 with: - name: native-lib-ubuntu-latest-java-jni + name: native-lib-${{ matrix.os }}-java-jni - name: Stage native lib into Maven resources shell: bash run: | # The Java NativeLoader resolves - # /fyi/oxide/pdf/native/{OS}/{ARCH}/ from the JAR. - # Stage the cdylib at that path before mvn package so it - # gets embedded into the published JAR. 
- DEST="java/src/main/resources/fyi/oxide/pdf/native/Linux/x86_64" + # /fyi/oxide/pdf/native/{OS}/{ARCH}/ from the JAR + # (see internal/NativeLoader.java). Stage the freshly-built JNI + # cdylib at the path matching THIS runner's OS/arch before mvn + # package so the embedded-native + JNI tests run cross-OS. + case "${{ runner.os }}" in + Linux) OSDIR=Linux; LIB=libpdf_oxide_jni.so ;; + macOS) OSDIR=Mac; LIB=libpdf_oxide_jni.dylib ;; + Windows) OSDIR=Windows; LIB=pdf_oxide_jni.dll ;; + *) echo "::error::unsupported runner.os ${{ runner.os }}"; exit 1 ;; + esac + # GitHub runners: ubuntu/windows are x86_64; macos-latest is arm64. + case "${{ runner.arch }}" in + X64) ARCHDIR=x86_64 ;; + ARM64) ARCHDIR=aarch64 ;; + *) echo "::error::unsupported runner.arch ${{ runner.arch }}"; exit 1 ;; + esac + DEST="java/src/main/resources/fyi/oxide/pdf/native/$OSDIR/$ARCHDIR" mkdir -p "$DEST" - cp target/release/libpdf_oxide_jni.so "$DEST/libpdf_oxide_jni.so" + # The artifact preserves the original target/release/ layout. + cp "target/release/$LIB" "$DEST/$LIB" ls -la "$DEST" - name: mvn compile @@ -1211,14 +1437,54 @@ jobs: shopt -u nullglob [ "${#jars[@]}" -eq 1 ] || { echo "::error::expected exactly one target/pdf-oxide-*.jar, found: ${jars[*]:-}"; exit 1; } JAR="${jars[0]}" - jar tf "$JAR" | grep -q "fyi/oxide/pdf/native/Linux/x86_64/libpdf_oxide_jni.so" \ - || { echo "::error::Native lib missing from JAR"; exit 1; } + # The embedded native path matches THIS runner's OS/arch (the JAR + # built on each host bundles only that host's native; release.yml + # builds the all-arch fat JAR). 
+ case "${{ runner.os }}" in + Linux) NATIVE=fyi/oxide/pdf/native/Linux/x86_64/libpdf_oxide_jni.so ;; + macOS) NATIVE=fyi/oxide/pdf/native/Mac/aarch64/libpdf_oxide_jni.dylib ;; + Windows) NATIVE=fyi/oxide/pdf/native/Windows/x86_64/pdf_oxide_jni.dll ;; + esac + jar tf "$JAR" | grep -q "$NATIVE" \ + || { echo "::error::Native lib ($NATIVE) missing from JAR"; exit 1; } unzip -p "$JAR" META-INF/MANIFEST.MF | grep -q "Automatic-Module-Name: fyi.oxide.pdf" \ || { echo "::error::Manifest missing Automatic-Module-Name"; exit 1; } - echo "::notice::JAR validated: $JAR ($(stat -c%s "$JAR") bytes)" + # Portable file size: GNU stat (-c%s) on Linux, BSD stat (-f%z) on macOS. + SIZE=$(stat -c%s "$JAR" 2>/dev/null || stat -f%z "$JAR") + echo "::notice::JAR validated: $JAR ($SIZE bytes, native $NATIVE)" + + # Core example (01-extract-text): compile + run against the built JAR + # (which self-loads its embedded JNI native lib) and assert output. + # Runs once per host OS (Linux's JDK 11 row + the single macOS/Windows + # rows) so the JNI load path is exercised on Linux, macOS, and Windows. + - name: Run Java example (01-extract-text, with assertion) + if: matrix.jdk == '11' || matrix.os != 'ubuntu-latest' + shell: bash + working-directory: java + run: | + set -uo pipefail + # Classpath separator: ':' on Unix, ';' on Windows (the JDK there is + # native Windows even under git-bash). + case "${{ runner.os }}" in Windows) SEP=';' ;; *) SEP=':' ;; esac + JAR=$(ls target/pdf-oxide-*.jar | head -n1) + PDF="$GITHUB_WORKSPACE/tests/fixtures/simple.pdf" + # Runtime deps (slf4j-api, org.json) are declared compile-scope and + # reach real Maven/Gradle consumers transitively, but the bare JAR + # does not bundle them. Resolve the runtime classpath the same way a + # consumer would so the smoke test exercises the JAR as published. 
+ mvn -B -P!dev -q dependency:build-classpath \ + -DincludeScope=runtime -Dmdep.outputFile="$RUNNER_TEMP/cp.txt" + DEPS=$(cat "$RUNNER_TEMP/cp.txt") + mkdir -p "$RUNNER_TEMP/javaex" + javac -cp "$JAR" -d "$RUNNER_TEMP/javaex" \ + "$GITHUB_WORKSPACE/examples/java/01-extract-text/ExtractText.java" + out="$(java -cp "$JAR$SEP$RUNNER_TEMP/javaex$SEP$DEPS" ExtractText "$PDF" 2>&1)" \ + || { echo "$out"; echo "ASSERT FAILED (exit)"; exit 1; } + echo "$out" + echo "$out" | grep -q "Pages:" || { echo "ASSERT FAILED: missing 'Pages:'"; exit 1; } - name: Upload Java JAR artifact - if: matrix.jdk == '11' + if: matrix.jdk == '11' || matrix.os != 'ubuntu-latest' uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: java-jar-${{ matrix.os }} @@ -1278,7 +1544,7 @@ jobs: uses: ./.github/actions/free-disk-space - name: Install Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -1326,7 +1592,7 @@ jobs: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - name: Install Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -1365,7 +1631,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - uses: actions-rust-lang/setup-rust-toolchain@v1 + - uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -1385,7 +1651,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - uses: actions-rust-lang/setup-rust-toolchain@v1 + - uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -1405,7 +1671,7 @@ jobs: 
runs-on: ubuntu-latest steps: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - uses: actions-rust-lang/setup-rust-toolchain@v1 + - uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -1432,7 +1698,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - uses: actions-rust-lang/setup-rust-toolchain@v1 + - uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -1490,7 +1756,7 @@ jobs: run: | v=$(grep -E '^rust-version' Cargo.toml | head -1 | sed 's/.*"\(.*\)".*/\1/') echo "version=${v:-1.82}" >> "$GITHUB_OUTPUT" - - uses: actions-rust-lang/setup-rust-toolchain@v1 + - uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 46a1aa65b..7bb12a7c6 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -31,11 +31,18 @@ jobs: uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - name: Install Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' + # #544 (Proposal 3): cache the cargo registry + target dir so the CodeQL + # `cargo build` reuses prior compilation. This was the only Rust-building + # workflow with no caching at all; the others already cache via + # actions/cache, so rust-cache is added here only. 
+ - name: Cache + uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2 + - name: Initialize CodeQL uses: github/codeql-action/init@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v3 with: diff --git a/.github/workflows/outdated.yml b/.github/workflows/outdated.yml index a1d27b70a..8ef94d988 100644 --- a/.github/workflows/outdated.yml +++ b/.github/workflows/outdated.yml @@ -16,7 +16,7 @@ jobs: steps: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - uses: actions-rust-lang/setup-rust-toolchain@v1 + - uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -26,7 +26,7 @@ jobs: key: outdated - name: Install libgit2 (required by cargo-outdated) - run: sudo apt-get install -y libgit2-dev pkg-config + run: sudo apt-get -o Acquire::Retries=3 install -y libgit2-dev pkg-config - name: Install cargo-outdated run: cargo install --locked cargo-outdated diff --git a/.github/workflows/php.yml b/.github/workflows/php.yml index 49f77db6d..4b79590ec 100644 --- a/.github/workflows/php.yml +++ b/.github/workflows/php.yml @@ -59,7 +59,7 @@ jobs: tools: composer:v2 - name: Set up Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -149,6 +149,20 @@ jobs: PDF_OXIDE_CDYLIB_PATH: ${{ env.PDF_OXIDE_CDYLIB_PATH }} run: vendor/bin/phpunit --testsuite=Integration + # Core example (01-extract-text) runs on Linux/macOS/Windows with an + # output assertion — the cross-language cross-OS guard. 
+ - name: Run PHP example (01-extract-text, with assertion) + shell: bash + env: + PDF_OXIDE_CDYLIB_PATH: ${{ env.PDF_OXIDE_CDYLIB_PATH }} + run: | + set -uo pipefail + PDF="$GITHUB_WORKSPACE/tests/fixtures/simple.pdf" + out="$(php "$GITHUB_WORKSPACE/examples/php/01-extract-text/main.php" "$PDF" 2>&1)" \ + || { echo "$out"; echo "ASSERT FAILED (exit)"; exit 1; } + echo "$out" + echo "$out" | grep -q "Pages:" || { echo "ASSERT FAILED: missing 'Pages:'"; exit 1; } + - name: Upload test artifacts on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index bdfee7780..aa45ec916 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -42,7 +42,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Set up Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -120,7 +120,7 @@ jobs: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - name: Set up Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -336,8 +336,8 @@ jobs: dnf install -y python3-pip > /dev/null 2>&1 ;; ubuntu|debian) - apt-get update -qq - DEBIAN_FRONTEND=noninteractive apt-get install -qq -y python3-pip python3-venv > /dev/null 2>&1 + apt-get -o Acquire::Retries=3 update -qq + DEBIAN_FRONTEND=noninteractive apt-get -o Acquire::Retries=3 install -qq -y python3-pip python3-venv > /dev/null 2>&1 ;; alpine) apk add --no-cache python3 py3-pip > /dev/null 2>&1 @@ -390,7 +390,7 @@ jobs: python-version: '3.11' - name: Set up Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: 
cache: false rustflags: '' diff --git a/.github/workflows/release-fips.yml b/.github/workflows/release-fips.yml index 6041064e5..6d8c8fce7 100644 --- a/.github/workflows/release-fips.yml +++ b/.github/workflows/release-fips.yml @@ -165,7 +165,7 @@ jobs: - name: Install build deps (Linux) if: runner.os == 'Linux' - run: sudo apt-get update && sudo apt-get install -y cmake nasm golang-go + run: sudo apt-get -o Acquire::Retries=3 update && sudo apt-get -o Acquire::Retries=3 install -y cmake nasm golang-go - name: Install build deps (macOS) if: runner.os == 'macOS' @@ -186,7 +186,7 @@ jobs: uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # v8.2.0 - name: Install Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -365,7 +365,7 @@ jobs: - name: Install build deps (Linux) if: runner.os == 'Linux' - run: sudo apt-get update && sudo apt-get install -y cmake nasm golang-go + run: sudo apt-get -o Acquire::Retries=3 update && sudo apt-get -o Acquire::Retries=3 install -y cmake nasm golang-go - name: Install build deps (macOS) if: runner.os == 'macOS' @@ -376,7 +376,7 @@ jobs: run: choco install nasm -y - name: Install Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -618,7 +618,7 @@ jobs: - name: Install build deps (Linux) if: runner.os == 'Linux' - run: sudo apt-get update && sudo apt-get install -y cmake nasm golang-go + run: sudo apt-get -o Acquire::Retries=3 update && sudo apt-get -o Acquire::Retries=3 install -y cmake nasm golang-go - name: Install build deps (macOS) if: runner.os == 'macOS' @@ -629,7 +629,7 @@ jobs: run: choco install nasm -y - name: Install Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: 
actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -788,7 +788,7 @@ jobs: - name: Install build deps (Linux) if: runner.os == 'Linux' - run: sudo apt-get update && sudo apt-get install -y cmake nasm golang-go + run: sudo apt-get -o Acquire::Retries=3 update && sudo apt-get -o Acquire::Retries=3 install -y cmake nasm golang-go - name: Install build deps (macOS) if: runner.os == 'macOS' @@ -799,7 +799,7 @@ jobs: run: choco install nasm -y - name: Install Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index b4bae2c57..2ac1aa089 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -179,7 +179,7 @@ jobs: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - name: Install Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -199,13 +199,13 @@ jobs: - name: Install musl tools (Linux musl) if: matrix.target == 'x86_64-unknown-linux-musl' - run: sudo apt-get update && sudo apt-get install -y musl-tools + run: sudo apt-get -o Acquire::Retries=3 update && sudo apt-get -o Acquire::Retries=3 install -y musl-tools - name: Install cross-compilation tools (Linux ARM64) if: matrix.target == 'aarch64-unknown-linux-gnu' run: | - sudo apt-get update - sudo apt-get install -y gcc-aarch64-linux-gnu + sudo apt-get -o Acquire::Retries=3 update + sudo apt-get -o Acquire::Retries=3 install -y gcc-aarch64-linux-gnu - name: Build binaries shell: bash @@ -263,7 +263,7 @@ jobs: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - name: Install Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: 
actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -379,7 +379,7 @@ jobs: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - name: Install Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -396,14 +396,14 @@ jobs: - name: Install cross-compilation tools (Linux ARM64) if: contains(matrix.target, 'aarch64-unknown-linux') run: | - sudo apt-get update - sudo apt-get install -y gcc-aarch64-linux-gnu + sudo apt-get -o Acquire::Retries=3 update + sudo apt-get -o Acquire::Retries=3 install -y gcc-aarch64-linux-gnu - name: Install cross-compilation tools (Windows GNU) if: matrix.target == 'x86_64-pc-windows-gnu' run: | - sudo apt-get update - sudo apt-get install -y gcc-mingw-w64-x86-64 + sudo apt-get -o Acquire::Retries=3 update + sudo apt-get -o Acquire::Retries=3 install -y gcc-mingw-w64-x86-64 - name: Build cdylib + staticlib shell: bash @@ -529,7 +529,7 @@ jobs: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - name: Install Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -546,8 +546,8 @@ jobs: - name: Install cross-compilation tools (Linux ARM64) if: contains(matrix.target, 'aarch64-unknown-linux') run: | - sudo apt-get update - sudo apt-get install -y gcc-aarch64-linux-gnu + sudo apt-get -o Acquire::Retries=3 update + sudo apt-get -o Acquire::Retries=3 install -y gcc-aarch64-linux-gnu - name: Build pdf_oxide_jni cdylib shell: bash @@ -1318,7 +1318,7 @@ jobs: python-version: '3.11' - name: Install Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ 
-1333,8 +1333,8 @@ jobs: - name: Install Linux cross-compilation tools if: matrix.target == 'aarch64-unknown-linux-gnu' run: | - sudo apt-get update - sudo apt-get install -y gcc-aarch64-linux-gnu + sudo apt-get -o Acquire::Retries=3 update + sudo apt-get -o Acquire::Retries=3 install -y gcc-aarch64-linux-gnu - name: Download type stubs uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 @@ -1528,7 +1528,7 @@ jobs: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - name: Install Rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -1872,7 +1872,7 @@ jobs: bundler-cache: true - name: Set up Rust (target ${{ matrix.cargo_target }}) - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml index f9a5be05f..ba0af1bf8 100644 --- a/.github/workflows/ruby.yml +++ b/.github/workflows/ruby.yml @@ -240,7 +240,7 @@ jobs: bundler-cache: true - name: Set up Rust (target ${{ matrix.cargo_target }}) - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 # v1 with: cache: false rustflags: '' @@ -319,6 +319,22 @@ jobs: COVERAGE_LCOV: ${{ matrix.coverage && '1' || '' }} run: bundle exec rspec spec/ + # Core example (01-extract-text) runs on every native-arch cell + # (Linux/macOS/Windows) with an output assertion — the cross-language + # cross-OS guard. + - name: Run Ruby example (01-extract-text, with assertion) + if: ${{ ! 
matrix.cross_compiled }} + working-directory: ruby + shell: bash + run: | + set -uo pipefail + PDF="$GITHUB_WORKSPACE/tests/fixtures/simple.pdf" + out="$(bundle exec ruby -rbundler/setup -Ilib \ + "$GITHUB_WORKSPACE/examples/ruby/01-extract-text/main.rb" "$PDF" 2>&1)" \ + || { echo "$out"; echo "ASSERT FAILED (exit)"; exit 1; } + echo "$out" + echo "$out" | grep -q "Pages:" || { echo "ASSERT FAILED: missing 'Pages:'"; exit 1; } + - name: Upload coverage to Codecov if: matrix.coverage == true uses: codecov/codecov-action@aa56896cf108bd10b5eb883cd1d24196da57f695 # v5.5.4 diff --git a/CHANGELOG.md b/CHANGELOG.md index 0af67d061..24f52380a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,14 +2,19 @@ All notable changes to PDFOxide are documented here. -## [Unreleased] +## [0.3.61] - 2026-06-07 -> Press-accurate CMYK→RGB on the composite render path via the document `/OutputIntents` ICC profile; vertical writing mode (WMode 1 / tategaki) support across extraction, rendering, and reading-order pipelines +> Press-accurate CMYK→RGB rendering via document `/OutputIntents` ICC profiles, vertical writing mode (WMode 1 / tategaki) support, RTL (Hebrew/Arabic) and Indic text-extraction fixes, separation-plate image rendering and ActualText extraction, path flattening (`PathContent::to_points`), Node.js quickstart and form-display fixes, macOS OCR detection, faster table-heavy extraction, and cross-OS + cross-language CI verification ### Added -- **Vertical writing mode (WMode 1 / tategaki) support across extraction, rendering, and reading-order pipelines** — Japanese tategaki, Traditional Chinese vertical packaging, and similar `-V`-suffixed encodings (Identity-V, UniJIS-UTF16-V, UniGB-UTF16-V, UniCNS-UTF16-V, UniKS-UTF16-V) plus CMap streams with `/WMode 1 def` now drive vertical glyph advance along the y-axis instead of being silently rendered as horizontal. 
The §9.4.4 axis-swap math lives in a single helper (`GraphicsState::advance_text_matrix`) consumed by the extractor, page renderer, separation renderer, and text rasterizer — horizontal text pays one predicted-not-taken branch per advance. Per-CID `/W2` (§9.7.4.3) and `/DW2` arrays are parsed for vertical metrics; ToUnicode `/WMode` is intentionally ignored per §9.10.2 so a stale tooling leftover can't flip the document. Vertical-majority pages (≥50% of spans tagged `wmode == 1`) bypass the configured `ReadingOrderStrategyType` and route through a dedicated right-to-left column-ordering path, since none of the horizontal strategies can produce correct vertical reading order. -- **Press-accurate CMYK→RGB via document `/OutputIntents` ICC profile** — the composite render path now consumes the document's `/OutputIntents` CMYK `DestOutputProfile` and routes `/DeviceCMYK` paint, `/Separation` / `/DeviceN` colourants resolving to a `/DeviceCMYK` alternate, and `/ICCBased N=4` spaces lacking a usable embedded profile through `qcms` (ISO 32000-1:2008 §14.11.5, §10). The conversion is built as `qcms::Transform::new_to(src = OutputIntent, dst = sRGB)`, so it uses the OutputIntent profile's AToB ("device-to-PCS") direction into the CIE PCS and then the sRGB profile's PCS-to-device direction out — composite direction CMYK → CIE PCS → sRGB. Closes the press-vs-screen colour divergence on heavy-yellow / saturated-mid-tone branding artwork that previously rendered through the §10.3.5 additive-clamp fallback. When no `/OutputIntents` is declared, §10.3.5 is preserved byte-for-byte. 
+- **Vertical writing mode (WMode 1 / tategaki) support across extraction, rendering, and reading-order pipelines (#645)** — Japanese tategaki, Traditional Chinese vertical packaging, and similar `-V`-suffixed encodings (Identity-V, UniJIS-UTF16-V, UniGB-UTF16-V, UniCNS-UTF16-V, UniKS-UTF16-V) plus CMap streams with `/WMode 1 def` now drive vertical glyph advance along the y-axis instead of being silently rendered as horizontal. The §9.4.4 axis-swap math lives in a single helper (`GraphicsState::advance_text_matrix`) consumed by the extractor, page renderer, separation renderer, and text rasterizer — horizontal text pays one predicted-not-taken branch per advance. Per-CID `/W2` (§9.7.4.3) and `/DW2` arrays are parsed for vertical metrics; ToUnicode `/WMode` is intentionally ignored per §9.10.2 so a stale tooling leftover can't flip the document. Vertical-majority pages (≥50% of spans tagged `wmode == 1`) bypass the configured `ReadingOrderStrategyType` and route through a dedicated right-to-left column-ordering path, since none of the horizontal strategies can produce correct vertical reading order. Thanks @RayVR. +- **`PathContent::to_points(tolerance)` path flattening (#147)** — flattens an extracted vector path into polylines (a nested `Vec` of points, one inner vec per subpath) for consumers that need sampled coordinates rather than drawing operators (chart/ECG/CAD digitisation). `MoveTo`/`LineTo` pass through unchanged; cubic Béziers are adaptively subdivided to stay within `tolerance` of the true curve. Subpath handling follows ISO 32000-1:2008 §8.5.2 (Table 59). Thanks @mbeschastn0v, and @joelparkerhenderson for the use case. +- **Separation-plate image rendering (#631)** — raster Image XObjects are now routed to the matching ink plates in the separation renderer (previously only Form XObjects were handled, so photo content, gradients, and sample-based artwork were absent from per-ink output). Per-pixel routing dispatches by image colour space (ISO 32000-1:2008 §8.9). 
Thanks @RayVR. +- **`/ActualText` extraction for structure-tree spans (#646)** — `/ActualText` on a `StructElem` (the form InDesign emits for drop caps, ligature spans, and stylized text in tagged PDFs, §14.9.4) is now applied correctly in `extract_text` / `to_markdown` / `to_html` — emitting the replacement text once, at the right position, instead of duplicating it with the raw descendant glyphs. Marked-content-scope `/ActualText` already worked. Thanks @RayVR. +- **Article-thread (`/Threads`) parsing (#458)** — a new parser reads a document's article threads (ISO 32000-1:2008 §12.4.3) into per-page bead rectangles, with an accompanying reading-order strategy, shipped as tested public building blocks. The default reading order is unchanged; auto-wiring threads into it is tracked for a future release. +- **Cross-language test-parity suite** — one shared functional spec (open, extract, convert, search, structured extraction, create, encrypt, version) is now implemented idiomatically in all nine bindings (Rust, Python, Node, Go, Java, Ruby, PHP, C#, WASM), so every binding is verified to expose the same core behavior. +- **Press-accurate CMYK→RGB via document `/OutputIntents` ICC profile (#652)** — the composite render path now consumes the document's `/OutputIntents` CMYK `DestOutputProfile` and routes `/DeviceCMYK` paint, `/Separation` / `/DeviceN` colourants resolving to a `/DeviceCMYK` alternate, and `/ICCBased N=4` spaces lacking a usable embedded profile through `qcms` (ISO 32000-1:2008 §14.11.5, §10). The conversion is built as `qcms::Transform::new_to(src = OutputIntent, dst = sRGB)`, so it uses the OutputIntent profile's AToB ("device-to-PCS") direction into the CIE PCS and then the sRGB profile's PCS-to-device direction out — composite direction CMYK → CIE PCS → sRGB. Closes the press-vs-screen colour divergence on heavy-yellow / saturated-mid-tone branding artwork that previously rendered through the §10.3.5 additive-clamp fallback. 
When no `/OutputIntents` is declared, §10.3.5 is preserved byte-for-byte. Thanks @RayVR. - **Page-level `/DefaultGray` / `/DefaultRGB` / `/DefaultCMYK` overrides (§8.6.5.6)** — when a page or Form XObject's `/Resources /ColorSpace` declares these defaults, the canonical `g` / `rg` / `k` / `K` operators (and their stroking siblings) are routed through the override colour space before any document-level `/OutputIntents` lookup. A `/DefaultCMYK [/ICCBased ]` override drives the conversion through its embedded profile; the override takes precedence over the document `/OutputIntents` for bare device-family paint. Form XObject overrides take precedence inside the form's scope (§7.8.3). - **Rendering-intent operator (`/RI`) honoured in the render path (§10.7.3)** — the `/RI` operator was being parsed but its value never reached the colour conversion. The graphics-state intent (`/AbsoluteColorimetric` / `/RelativeColorimetric` / `/Saturation` / `/Perceptual`, defaulting to `/RelativeColorimetric`) now flows into every qcms `Transform::new_srgb_target` build. Two `/RI` settings on the same page now compile two distinct transforms instead of silently sharing one. - **ICC v2 and ICC v4 `DestOutputProfile` profiles both supported** through qcms 0.3.0's unconditional header-version check. A v4 LUT8-tag-form profile compiles through the same code path as the v2 equivalent and produces byte-identical RGB. @@ -17,14 +22,33 @@ All notable changes to PDFOxide are documented here. ### Changed -- **`FontInfo` gains three new `pub` fields (`wmode: u8`, `cid_vertical_metrics: Option>`, `cid_default_vertical_metrics: VerticalMetrics`)** — source-breaking for downstream code that constructs `FontInfo` with struct-literal syntax; add the three new fields to fix. Horizontal-only fonts pay no allocation cost (`cid_vertical_metrics: None`, `cid_default_vertical_metrics` is `Copy`). `FontInfo` continues to NOT be `#[non_exhaustive]`, consistent with the 0.3.60 `ascent`/`descent` addition. 
The natural construction values for the new fields are `wmode: 0`, `cid_vertical_metrics: None`, `cid_default_vertical_metrics: VerticalMetrics::SPEC_DEFAULT`. -- **`ReadingOrderConfig::strategy` is now overridden on vertical-majority pages** — the configured horizontal strategy (Simple, Geometric, XYCut, StructureTreeFirst) is bypassed when a page has ≥50% vertical-writing spans, and the page is ordered through the tategaki path instead. Per-span `wmode` is preserved on every output span so consumers can still distinguish the two modes. See `ReadingOrderConfig::strategy` rustdoc for the rule. -- **`ResolvedColor` gains an `IccCmyk { rgba, cmyk }` dual-payload variant** — `/ICCBased N=4` paint with a parseable embedded profile (and `/DefaultCMYK [/ICCBased N=4]` overrides) emits both the pre-converted RGBA (consumed by the composite backend) and the original CMYK quadruple (consumed by the per-plate separation router). Source-breaking for downstream code that exhaustively matches on `ResolvedColor`; add the new arm to fix. The type is not `#[non_exhaustive]`. +- **Faster extraction on table-heavy pages** — an output-preserving optimization to the table detector cuts ~30% off extraction time for large, dense documents (e.g. regulatory volumes) with byte-identical results. +- **Cross-OS + FIPS example verification in CI** — the core example scenarios for every language binding now run **and assert their output** on Linux, macOS, and Windows (previously Linux-only — the gap that let #648 reach users), including a FIPS-safe run. This is the guard that prevents another platform-specific quickstart regression. +- **Renderer resolution pipeline refactor (#649)** — the copy-pasted paint-resolution arms in `page_renderer` and `separation_renderer` are unified into a single layered `rendering/resolution` module (colour resolution, overprint, blend-mode, clip, per-plate routing), removing quiet divergence between the two renderers. 
This also fixes PostScript Type 4 calculator tint transforms for `/Separation` and `/DeviceN` spot colours over `DeviceCMYK`/`ICCBased` alternates, which previously fell through to a flat fallback. Thanks @RayVR. +- **CI reliability hardening (#544)** — SHA-pinned the remaining floating GitHub Action and added network retries to reduce transient CI failures. +- **Dependency & CI-action updates** — `imageproc` 0.27, `subsetter` 0.2.6, `log` 0.4.32, and the `actions/checkout`, `taiki-e/install-action`, and `astral-sh/setup-uv` actions were bumped (dependabot, #639/#637/#636/#643/#641/#640). +- **`FontInfo` gains three new `pub` fields (`wmode: u8`, `cid_vertical_metrics: Option<…>`, `cid_default_vertical_metrics: VerticalMetrics`) (#645)** — source-breaking for downstream code that constructs `FontInfo` with struct-literal syntax; add the three new fields to fix. Horizontal-only fonts pay no allocation cost (`cid_vertical_metrics: None`, `cid_default_vertical_metrics` is `Copy`). `FontInfo` continues to NOT be `#[non_exhaustive]`, consistent with the 0.3.60 `ascent`/`descent` addition. The natural construction values for the new fields are `wmode: 0`, `cid_vertical_metrics: None`, `cid_default_vertical_metrics: VerticalMetrics::SPEC_DEFAULT`. +- **`ReadingOrderConfig::strategy` is now overridden on vertical-majority pages (#645)** — the configured horizontal strategy (Simple, Geometric, XYCut, StructureTreeFirst) is bypassed when a page has ≥50% vertical-writing spans, and the page is ordered through the tategaki path instead. Per-span `wmode` is preserved on every output span so consumers can still distinguish the two modes. See `ReadingOrderConfig::strategy` rustdoc for the rule. 
+- **`ResolvedColor` gains an `IccCmyk { rgba, cmyk }` dual-payload variant (#652)** — `/ICCBased N=4` paint with a parseable embedded profile (and `/DefaultCMYK [/ICCBased N=4]` overrides) emits both the pre-converted RGBA (consumed by the composite backend) and the original CMYK quadruple (consumed by the per-plate separation router). Source-breaking for downstream code that exhaustively matches on `ResolvedColor`; add the new arm to fix. The type is not `#[non_exhaustive]`. - **`/ICCBased N=4` with an embedded profile now wins over document `/OutputIntents`** (§8.6.5.5). Pre-this-change, an embedded `/ICCBased N=4` colour space with a parseable qcms profile emitted `ResolvedColor::Cmyk` and was projected through the document `/OutputIntents` ICC profile by the composite pipeline — inverting the spec's "embedded ICC trumps OutputIntent". The four components are now routed through the embedded profile directly and the OutputIntent is consulted only when the embedded profile fails to parse or qcms refuses to build a CMM. +### Fixed + +- **Node.js quickstart (#648)** — the documented Quick Start used CommonJS `require` (the package is ESM-only) and `new PdfDocument(path)` (not a public constructor), so the first example threw. Samples now use `import` + `PdfDocument.open(path)`, and the constructor gives an actionable error if handed a path. Thanks @abeq for the report and @lihouwenbin for the docs fix (#651). +- **Form fields filled but not displayed (#647)** — for PDFs with an inline AcroForm, a filled field's value was written but the `/NeedAppearances` flag was dropped on full-rewrite save, so viewers showed the field blank (ISO 32000-1:2008 §12.7.3.3). The flag now survives, and viewers render the filled value. Thanks @mitslabo. +- **macOS OCR engine detection (#632)** — `onnxruntime` auto-discovery missed the versioned `libonnxruntime.<version>.dylib` macOS actually ships, so OCR was silently skipped; detection is now version-tolerant across Linux/macOS/Windows. 
Thanks @paliwalvimal. +- **Deterministic table detection** — several table-detection steps could order results by per-process hash iteration, yielding run-to-run differences on table-heavy pages; these are now deterministic. +- **Decimal points in `CMSY`/Symbol math fonts** — numbers whose decimal point is drawn from a math font's `logicalnot` glyph (e.g. `1¬00`, and the spaced `1¬ 00`) now extract as `1.00`. +- **Consistent empty output for unreadable encrypted PDFs** — all text surfaces (`extract_text`, markdown, HTML, plain text) now uniformly return empty for an encrypted PDF that can't be decrypted, instead of one surface diverging (ISO 32000-1:2008 §7.6). +- **RTL (Hebrew/Arabic) word order in tagged PDFs (#656, #657)** — `extract_text` on a tagged PDF assembles text from the structure tree and never reached the untagged `reverse_rtl_visual_order_runs` pass, so a pure-RTL run's word-spans were emitted in visual (left-to-right) order — the whole line reversed. A new geometric pass (`order_mcid_spans` → `row_aware_span_cmp_rtl`) now emits each pure-RTL row right-to-left (rightmost word first), reconstructing logical reading order from page geometry independent of how the producer stored the run (ISO 32000-1:2008 §14.8, UAX #9 §3.3.4). Hebrew text-extraction error rate on the multilingual benchmark drops from worst-tier to parity with poppler/pdfium. Two companion fixes improve Arabic: a **grapheme-cluster reversal** keeps combining marks (kasra/shadda) attached to their base letter instead of floating off, and a **space-gap threshold fallback** (0.25 em) when a CID subset font omits a space glyph stops the geometric word-gap threshold from collapsing to zero and shattering cursive words into single letters (§9.3.3 — `Tw` does not apply to composite-font Arabic). Remaining Arabic intra-word phantom gaps (glyph ink-width vs advance-width) and the markdown/HTML converter paths are tracked in #656/#657. Mixed RTL+Latin runs are left untouched pending full bidi. 
+- **Node.js binding missing barrel exports (#653)** — the package's ESM entry re-exports `ContentType`, `ImageFormat`, `ThumbnailManager`/`ThumbnailSize`, and the `OCRDetectionMode`/`OCRLanguage` aliases through `managers/index.js`, but that barrel never re-exported the hybrid-ml and thumbnail modules (the CJS `require` path tolerated the gap silently). Strict ESM consumers — including the new cross-language core-parity test — failed at import with "does not provide an export named 'ContentType'". The managers barrel now re-exports all of them. +- **Java binding native-library load on macOS/Windows (#653)** — `NativeLoader` loaded the explicit `fyi.oxide.pdf.lib.path` override unconditionally, but the Maven build defaults that property to a Linux `.so` path. On macOS (`.dylib`) and Windows (`.dll`) the override file is absent, so `System.load` hard-failed with `UnsatisfiedLinkError` even though the correct platform native is bundled in the JAR. The loader now checks the override exists and otherwise falls through to the bundled resource. Surfaced by v0.3.61's new cross-OS Java JNI test runs. +- **Number corruption in plain-text table cells** — per-glyph table cells (`Td Tj`, e.g. `0.99`, `Q1`) were merged into one span without keeping the `char_widths` array in sync, so the width-based column-spanning-decimal and letter→digit splitters misfired and **dropped the decimal point** (`0.99` → `0 99`) or inserted a spurious space (`Q1` → `Q 1`). `char_widths` is now re-synced on merge (benchmark table CER 0.117→0.067, 0.091→0.061). +- **Spurious word spaces in Indic scripts (Tamil/Bengali/Devanagari)** — Brahmic text extracted the right codepoints but inserted a space after nearly every dependent vowel sign (matra), because a matra carries its own advance and the geometric gap test read matra→consonant as a word break. 
Tamil/Bengali/Telugu/Kannada/Malayalam are now recognised by the complex-script word-boundary path, the matra→base-consonant boundary is suppressed, and `should_insert_space` gained a combining-mark guard (benchmark text CER: Tamil 0.095→0.035, Bengali 0.175→0.032, Devanagari 0.066→0.016; real word breaks carry an explicit space glyph, §9.3.3). + ### Known limitations -These are intentional gaps the test suite documents with `HONEST_GAP_*` markers so a future engineer (or a qcms upgrade) flips them RED on landing: +These are limitations of the upstream `qcms` 0.3.0 colour engine (items 1–2) and one test-coverage gap (item 3), tracked in #655. The test suite documents each with a `HONEST_GAP_*` marker wired as an upgrade gate, so a future engineer (or a qcms upgrade) flips the gated test RED on landing: - **qcms 0.3.0 ignores the CMYK rendering intent**. The end-to-end intent chain inside pdf_oxide is correct — `gs.rendering_intent` → `ResolutionContext::rendering_intent` → `Transform::new_srgb_target`'s `intent` parameter → qcms — but qcms 0.3.0 declares the intent as `_intent` for CLUT-based CMYK conversion (`transform.rs:1283-1289`) and dispatches the same CLUT for every PDF intent. A qcms upgrade that honours the parameter, or a CMM swap, will surface intent-sensitive behaviour without further code changes; the test `qa_round3_qcms_030_treats_cmyk_intent_as_informational` is the upgrade gate. - **qcms 0.3.0 has no Black-Point Compensation** (`lib.rs:29-36` — upstream documents the choice as intentional). `qa_round4_bpc_paper_white_preservation_under_relative_colorimetric` is `#[ignore]`-marked with `HONEST_GAP_QCMS_030_NO_BPC`. 
diff --git a/Cargo.lock b/Cargo.lock index ccc685bed..584225c3f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2930,7 +2930,7 @@ dependencies = [ [[package]] name = "pdf_oxide" -version = "0.3.60" +version = "0.3.61" dependencies = [ "aes 0.9.1", "aws-lc-rs", @@ -3024,7 +3024,7 @@ dependencies = [ [[package]] name = "pdf_oxide_cli" -version = "0.3.60" +version = "0.3.61" dependencies = [ "clap", "is-terminal", @@ -3034,7 +3034,7 @@ dependencies = [ [[package]] name = "pdf_oxide_jni" -version = "0.3.60" +version = "0.3.61" dependencies = [ "jni", "pdf_oxide", @@ -3043,7 +3043,7 @@ dependencies = [ [[package]] name = "pdf_oxide_mcp" -version = "0.3.60" +version = "0.3.61" dependencies = [ "pdf_oxide", "serde_json", diff --git a/Cargo.toml b/Cargo.toml index b16662e67..c19efe203 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -58,7 +58,7 @@ manual_checked_ops = "allow" [package] name = "pdf_oxide" -version = "0.3.60" +version = "0.3.61" # MSRV — driven up from 1.82 for v0.3.38. Transitive deps pulled in # this release push the floor to 1.88: # - hybrid-array 0.4.10 (via RustCrypto) → edition 2024 → 1.85 @@ -515,6 +515,22 @@ name = "showcase_barcode_svg" path = "examples/rust/09-new-features/barcode_svg/main.rs" required-features = ["barcodes"] +[[example]] +name = "showcase_compliance_validation" +path = "examples/rust/09-new-features/compliance_validation/main.rs" + +[[example]] +name = "showcase_dashed_stroke" +path = "examples/rust/09-new-features/dashed_stroke/main.rs" + +[[example]] +name = "showcase_encrypted_bytes" +path = "examples/rust/09-new-features/encrypted_bytes/main.rs" + +[[example]] +name = "showcase_page_extraction" +path = "examples/rust/09-new-features/page_extraction/main.rs" + [[example]] name = "showcase_streaming_table" path = "examples/rust/09-new-features/streaming_table/main.rs" diff --git a/csharp/PdfOxide.Tests/CoreParityTests.cs b/csharp/PdfOxide.Tests/CoreParityTests.cs new file mode 100644 index 000000000..bb900ee50 --- /dev/null +++ 
b/csharp/PdfOxide.Tests/CoreParityTests.cs @@ -0,0 +1,95 @@ +using System; +using System.Text; +using PdfOxide.Core; +using PdfOxide.Exceptions; +using Xunit; + +namespace PdfOxide.Tests +{ + /// + /// Core functional test-parity suite (C#) — mirrors the shared + /// cross-language spec + /// (docs/releases/plans/v0.3.61/core-test-parity-spec.md) with the + /// idiomatic .NET API. Every binding asserts the same behaviors. + /// + /// Each test is self-contained: it builds its own input via + /// and opens it from bytes, so the suite has no + /// fixture-file dependency. + /// + public class CoreParityTests + { + private static byte[] BuildBytes() + { + using var pdf = Pdf.FromText("Core parity across all bindings.\nSecond line of text."); + return pdf.SaveToBytes(); + } + + private static PdfDocument Open() => PdfDocument.Open(BuildBytes()); + + [Fact] + public void OpenAndPageCount() + { + using var doc = Open(); + Assert.True(doc.PageCount >= 1); + } + + [Fact] + public void ExtractTextReturnsString() + { + using var doc = Open(); + Assert.NotNull(doc.ExtractText(0)); + } + + [Fact] + public void ConvertMarkdownHtmlPlain() + { + using var doc = Open(); + Assert.NotNull(doc.ToMarkdown(0)); + Assert.NotNull(doc.ToHtml(0)); + Assert.NotNull(doc.ToPlainText(0)); + } + + [Fact] + public void SearchReturnsResults() + { + using var doc = Open(); + Assert.NotNull(doc.SearchAll("parity")); + } + + [Fact] + public void StructuredExtraction() + { + using var doc = Open(); + Assert.NotNull(doc.ExtractStructured(0)); + } + + [Fact] + public void CreatePdfFromText() + { + var bytes = BuildBytes(); + Assert.True(bytes.Length > 4); + Assert.Equal("%PDF-", Encoding.ASCII.GetString(bytes, 0, 5)); + } + + [Fact] + public void OpenFromBytesPageCount() + { + using var doc = PdfDocument.Open(BuildBytes()); + Assert.True(doc.PageCount >= 1); + } + + [Fact] + public void OpeningMissingPathThrows() + { + Assert.ThrowsAny(() => PdfDocument.Open("/no/such/file/does/not/exist.pdf")); + } + 
+ [Fact] + public void ExposesVersion() + { + using var doc = Open(); + var (major, _) = doc.Version; + Assert.True(major >= 1); + } + } +} diff --git a/csharp/PdfOxide/PdfOxide.csproj b/csharp/PdfOxide/PdfOxide.csproj index 7525dad6e..9545b18c7 100644 --- a/csharp/PdfOxide/PdfOxide.csproj +++ b/csharp/PdfOxide/PdfOxide.csproj @@ -19,7 +19,7 @@ false PdfOxide - 0.3.60 + 0.3.61 PdfOxide pdf_oxide Contributors pdf_oxide Project diff --git a/examples/README.md b/examples/README.md index d21112277..8591a7137 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1,39 +1,61 @@ -# PDF Library Examples +# pdf_oxide Examples -This directory contains examples demonstrating various features of pdf_oxide. +Runnable examples demonstrating pdf_oxide across every supported language. The +**core scenarios (01–07)** are mirrored in each language and exercised in CI on +Linux, macOS, and Windows; the **09-new-features** showcases run on Linux. -## Available Examples +## Layout -Run examples with: -```bash -cargo run --example -- [args] +``` +examples/ + rust/ 01..08 + 09-new-features/ (also registered as cargo --example targets) + python/ 01..08 + 09-new-features/ + javascript/ 01..08 + 09-new-features/ + go/ 01..08 + 09-new-features/ + csharp/ 01..08 + 09-new-features/ + java/ 01..07 (core scenarios) + ruby/ 01..07 (core scenarios) + php/ 01..07 (core scenarios) + wasm_node/ extract_text.mjs (WASM on a Node host) ``` -### Basic Examples -- **basic_usage.rs** - Simple text extraction -- **extract_text.rs** - Text extraction with options -- **check_form_tables.rs** - Form field inspection +## Core scenarios (01–07) -### Analysis Examples -- **analyze_font_f1.rs** - Font analysis and debugging -- **analyze_pdf_structure.rs** - PDF structure inspection +| # | Scenario | What it shows | +|---|----------|---------------| +| 01 | extract-text | open a PDF, page count, per-page text | +| 02 | convert-formats | page → Markdown / HTML / plain text | +| 03 | create-pdf | build a PDF 
from Markdown/HTML/text | +| 04 | search-text | full-text search across pages | +| 05 | extract-structured | words + bounding boxes, lines, tables | +| 06 | edit-document | metadata edit, page delete, merge | +| 07 | forms-annotations | extract form fields + annotations | -### Python Examples -Located in `python/` subdirectory. +08 (batch-processing) and the `09-new-features` showcases (barcodes, signing, +PDF/A & PDF/UA, encryption, image embedding, …) are demonstrated per language +where supported. -## Quick Start +## Running ```bash -# Extract text from a PDF -cargo run --example basic_usage -- path/to/document.pdf +# Rust +cargo run --example tutorial_extract_text -- tests/fixtures/simple.pdf + +# Python +cd examples/python/01-extract-text && python main.py ../../../tests/fixtures/simple.pdf + +# Node.js +node examples/javascript/01-extract-text/index.js tests/fixtures/simple.pdf + +# Go +cd examples/go/01-extract-text && go run main.go ../../../tests/fixtures/simple.pdf -# Check form fields -cargo run --example check_form_tables -- path/to/form.pdf +# C# +dotnet run --project examples/csharp/01-extract-text/ExtractText.csproj -- tests/fixtures/simple.pdf ``` ## Documentation -For comprehensive documentation, see: - API docs: https://docs.rs/pdf_oxide - Main README: [../README.md](../README.md) - Contributing guide: [../CONTRIBUTING.md](../CONTRIBUTING.md) diff --git a/examples/java/01-extract-text/ExtractText.java b/examples/java/01-extract-text/ExtractText.java new file mode 100644 index 000000000..fc86c6d1e --- /dev/null +++ b/examples/java/01-extract-text/ExtractText.java @@ -0,0 +1,26 @@ +// 01 — Extract text (Java) +// +// Opens a PDF, prints the page count, then the text of each page. +// +// Compile + run against the built jar (which embeds the JNI native lib): +// javac -cp pdf_oxide.jar ExtractText.java +// java -cp pdf_oxide.jar:. 
ExtractText ../../../tests/fixtures/simple.pdf + +import fyi.oxide.pdf.PdfDocument; + +public final class ExtractText { + public static void main(String[] args) { + if (args.length < 1) { + System.err.println("usage: java ExtractText "); + System.exit(1); + } + try (PdfDocument doc = PdfDocument.open(args[0])) { + int pages = doc.pageCount(); + System.out.println("Pages: " + pages); + for (int i = 0; i < pages; i++) { + System.out.println("--- Page " + (i + 1) + " ---"); + System.out.println(doc.extractText(i)); + } + } + } +} diff --git a/examples/php/01-extract-text/main.php b/examples/php/01-extract-text/main.php new file mode 100644 index 000000000..3ec46fff4 --- /dev/null +++ b/examples/php/01-extract-text/main.php @@ -0,0 +1,48 @@ +\n"); + exit(1); +} + +$doc = PdfDocument::open($path); +$pages = $doc->pageCount(); +echo "Pages: {$pages}\n"; +for ($i = 0; $i < $pages; $i++) { + echo '--- Page ' . ($i + 1) . " ---\n"; + echo $doc->extractText($i) . "\n"; +} diff --git a/examples/ruby/01-extract-text/main.rb b/examples/ruby/01-extract-text/main.rb new file mode 100644 index 000000000..1952d404e --- /dev/null +++ b/examples/ruby/01-extract-text/main.rb @@ -0,0 +1,19 @@ +# frozen_string_literal: true + +# 01 — Extract text (Ruby) +# +# Opens a PDF, prints the page count, then the text of each page. 
+# +# ruby main.rb ../../../tests/fixtures/simple.pdf + +require 'pdf_oxide' + +path = ARGV[0] or abort 'usage: ruby main.rb ' + +PdfOxide.open(path) do |doc| + puts "Pages: #{doc.page_count}" + doc.page_count.times do |i| + puts "--- Page #{i + 1} ---" + puts doc.extract_text(i) + end +end diff --git a/examples/rust/09-new-features/compliance_validation/main.rs b/examples/rust/09-new-features/compliance_validation/main.rs new file mode 100644 index 000000000..ec2c54202 --- /dev/null +++ b/examples/rust/09-new-features/compliance_validation/main.rs @@ -0,0 +1,46 @@ +// PDF/A, PDF/X, PDF/UA compliance validation (Rust parity with the +// python/javascript/go/csharp `compliance_validation` showcases). +// +// Run: cargo run --example showcase_compliance_validation + +use pdf_oxide::compliance::{ + validate_pdf_a, validate_pdf_ua, validate_pdf_x, PdfALevel, PdfUaLevel, PdfXLevel, +}; +use pdf_oxide::error::Result; +use pdf_oxide::writer::DocumentBuilder; +use pdf_oxide::PdfDocument; + +fn main() -> Result<()> { + let mut builder = DocumentBuilder::new(); + builder + .letter_page() + .font("Helvetica", 12.0) + .at(72.0, 720.0) + .heading(1, "Compliance Validation") + .at(72.0, 690.0) + .paragraph("Testing PDF/A, PDF/X, and PDF/UA compliance validators.") + .done(); + let pdf_bytes = builder.build()?; + + println!("Validating PDF/A-2b compliance..."); + let mut doc = PdfDocument::from_bytes(pdf_bytes.clone())?; + let a = validate_pdf_a(&mut doc, PdfALevel::A2b)?; + println!( + " is_compliant: {} errors: {} warnings: {}", + a.is_compliant, + a.errors.len(), + a.warnings.len() + ); + + println!("Validating PDF/X-4 compliance..."); + let mut doc = PdfDocument::from_bytes(pdf_bytes.clone())?; + let x = validate_pdf_x(&mut doc, PdfXLevel::X4)?; + println!(" is_compliant: {} errors: {}", x.is_compliant, x.errors.len()); + + println!("Validating PDF/UA-1 compliance..."); + let mut doc = PdfDocument::from_bytes(pdf_bytes)?; + let ua = validate_pdf_ua(&mut doc, 
PdfUaLevel::Ua1)?; + println!(" is_compliant: {} errors: {}", ua.is_compliant, ua.errors.len()); + + Ok(()) +} diff --git a/examples/rust/09-new-features/dashed_stroke/main.rs b/examples/rust/09-new-features/dashed_stroke/main.rs new file mode 100644 index 000000000..f64f4c700 --- /dev/null +++ b/examples/rust/09-new-features/dashed_stroke/main.rs @@ -0,0 +1,44 @@ +// Dashed stroke lines and rectangles (Rust-idiomatic; uses `LineStyle::with_dash`). +// +// Run: cargo run --example showcase_dashed_stroke + +use pdf_oxide::error::Result; +use pdf_oxide::writer::{DocumentBuilder, LineStyle}; +use std::path::PathBuf; + +fn main() -> Result<()> { + let out_dir = PathBuf::from("target/examples_output/dashed_stroke"); + std::fs::create_dir_all(&out_dir)?; + + let mut builder = DocumentBuilder::new(); + builder + .letter_page() + .font("Helvetica", 12.0) + .at(72.0, 720.0) + .heading(1, "Dashed Stroke Demo") + .at(72.0, 690.0) + .paragraph("Rectangles and lines drawn with configurable dash patterns.") + // Dashed rectangle: [5 on, 3 off], 2pt blue border. + .stroke_rect( + 72.0, + 560.0, + 300.0, + 80.0, + LineStyle::new(2.0, 0.0, 0.2, 0.8).with_dash(&[5.0, 3.0], 0.0), + ) + // Dashed line: [8 on, 4 off], 1.5pt red. + .stroke_line( + 72.0, + 520.0, + 372.0, + 520.0, + LineStyle::new(1.5, 0.8, 0.0, 0.0).with_dash(&[8.0, 4.0], 0.0), + ) + .done(); + + let bytes = builder.build()?; + let path = out_dir.join("dashed.pdf"); + std::fs::write(&path, &bytes)?; + println!("Wrote {} ({} bytes)", path.display(), bytes.len()); + Ok(()) +} diff --git a/examples/rust/09-new-features/encrypted_bytes/main.rs b/examples/rust/09-new-features/encrypted_bytes/main.rs new file mode 100644 index 000000000..68e667bb6 --- /dev/null +++ b/examples/rust/09-new-features/encrypted_bytes/main.rs @@ -0,0 +1,31 @@ +// Encrypted PDF output (Rust-idiomatic; uses DocumentBuilder::to_bytes_encrypted). 
+// +// Run: cargo run --example showcase_encrypted_bytes + +use pdf_oxide::error::Result; +use pdf_oxide::writer::DocumentBuilder; +use std::path::PathBuf; + +fn main() -> Result<()> { + let out_dir = PathBuf::from("target/examples_output/encrypted_bytes"); + std::fs::create_dir_all(&out_dir)?; + + let mut builder = DocumentBuilder::new(); + builder + .letter_page() + .font("Helvetica", 12.0) + .at(72.0, 720.0) + .heading(1, "Encrypted PDF") + .at(72.0, 690.0) + .paragraph("This PDF is encrypted with a user and owner password.") + .done(); + + // AES-256 encryption with user + owner passwords (ISO 32000-1 §7.6). + let encrypted = builder.to_bytes_encrypted("user123", "owner123")?; + assert!(encrypted.starts_with(b"%PDF"), "encrypted output must start with %PDF"); + + let path = out_dir.join("encrypted.pdf"); + std::fs::write(&path, &encrypted)?; + println!("Wrote {} ({} bytes, encrypted)", path.display(), encrypted.len()); + Ok(()) +} diff --git a/examples/rust/09-new-features/page_extraction/main.rs b/examples/rust/09-new-features/page_extraction/main.rs new file mode 100644 index 000000000..ba86ba2ed --- /dev/null +++ b/examples/rust/09-new-features/page_extraction/main.rs @@ -0,0 +1,50 @@ +// Page extraction (Rust-idiomatic; uses DocumentEditor::extract_pages_to_bytes). +// +// Builds a 3-page PDF, then extracts pages 1 and 3 into a new PDF. +// +// Run: cargo run --example showcase_page_extraction + +use pdf_oxide::editor::DocumentEditor; +use pdf_oxide::error::Result; +use pdf_oxide::writer::DocumentBuilder; +use pdf_oxide::PdfDocument; +use std::path::PathBuf; + +fn main() -> Result<()> { + let out_dir = PathBuf::from("target/examples_output/page_extraction"); + std::fs::create_dir_all(&out_dir)?; + + // Build a 3-page document. 
+ let mut builder = DocumentBuilder::new(); + for n in 1..=3 { + builder + .letter_page() + .font("Helvetica", 12.0) + .at(72.0, 720.0) + .heading(1, &format!("Page {n}")) + .at(72.0, 690.0) + .paragraph(&format!("This is the content of page {n}.")) + .done(); + } + let bytes = builder.build()?; + println!( + "Built {}-page PDF ({} bytes)", + PdfDocument::from_bytes(bytes.clone())?.page_count()?, + bytes.len() + ); + + // Extract pages 1 and 3 (0-based indices 0 and 2) into a new PDF. + let mut editor = DocumentEditor::from_bytes(bytes)?; + let extracted = editor.extract_pages_to_bytes(&[0, 2])?; + let doc = PdfDocument::from_bytes(extracted.clone())?; + println!( + "Extracted pages [0, 2] → new PDF with {} pages ({} bytes)", + doc.page_count()?, + extracted.len() + ); + + let path = out_dir.join("extracted.pdf"); + std::fs::write(&path, &extracted)?; + println!("Wrote {}", path.display()); + Ok(()) +} diff --git a/examples/wasm_node/core_parity.test.mjs b/examples/wasm_node/core_parity.test.mjs new file mode 100644 index 000000000..a150e74a0 --- /dev/null +++ b/examples/wasm_node/core_parity.test.mjs @@ -0,0 +1,111 @@ +// Core functional test-parity suite (WASM / Node) — mirrors the shared +// cross-language spec (docs/releases/plans/v0.3.61/core-test-parity-spec.md) +// with the idiomatic wasm-bindgen API. Every binding asserts the same +// behaviors. +// +// The WASM package (pdf_oxide.js + .wasm) is produced by wasm-pack in CI and +// is gitignored. If it has not been built, the whole suite self-skips rather +// than failing — matching the graceful-degradation contract used elsewhere. 
+import assert from 'node:assert'; +import { describe, it, before } from 'node:test'; + +let WasmPdfDocument; +let WasmPdf; +let available = false; + +before(async () => { + try { + const mod = await import('./pdf_oxide.js'); + WasmPdfDocument = mod.WasmPdfDocument; + WasmPdf = mod.WasmPdf; + available = typeof WasmPdfDocument === 'function' && typeof WasmPdf === 'function'; + } catch { + available = false; + } +}); + +function makeBytes() { + const pdf = WasmPdf.fromText( + 'Core parity across all bindings.\nSecond line of text.', + 'Core Parity', + 'pdf_oxide', + ); + return pdf.toBytes(); +} + +function open() { + return new WasmPdfDocument(makeBytes()); +} + +describe('core parity (WASM)', () => { + it('create pdf from text → %PDF', (t) => { + if (!available) return t.skip('wasm package not built'); + const bytes = makeBytes(); + assert.ok(bytes.length > 4); + assert.strictEqual(Buffer.from(bytes.subarray(0, 5)).toString('latin1'), '%PDF-'); + }); + + it('open + page count == 1', (t) => { + if (!available) return t.skip('wasm package not built'); + const doc = open(); + try { + assert.strictEqual(doc.pageCount(), 1); + } finally { + doc.free(); + } + }); + + it('extract text returns a string', (t) => { + if (!available) return t.skip('wasm package not built'); + const doc = open(); + try { + assert.strictEqual(typeof doc.extractText(0), 'string'); + } finally { + doc.free(); + } + }); + + it('convert markdown / html / plain return strings', (t) => { + if (!available) return t.skip('wasm package not built'); + const doc = open(); + try { + assert.strictEqual(typeof doc.toMarkdown(0), 'string'); + assert.strictEqual(typeof doc.toHtml(0), 'string'); + assert.strictEqual(typeof doc.toPlainText(0), 'string'); + } finally { + doc.free(); + } + }); + + it('search returns results without throwing', (t) => { + if (!available) return t.skip('wasm package not built'); + const doc = open(); + try { + const res = doc.search('parity', true); + assert.ok(res !== undefined && 
res !== null); + } finally { + doc.free(); + } + }); + + it('structured extraction works', (t) => { + if (!available) return t.skip('wasm package not built'); + const doc = open(); + try { + assert.strictEqual(typeof doc.extractStructured(0), 'string'); + } finally { + doc.free(); + } + }); + + it('exposes the PDF version', (t) => { + if (!available) return t.skip('wasm package not built'); + const doc = open(); + try { + const v = doc.version(); + assert.ok(v[0] >= 1); + } finally { + doc.free(); + } + }); +}); diff --git a/go/cmd/install/main.go b/go/cmd/install/main.go index 336c7a142..483690cfe 100644 --- a/go/cmd/install/main.go +++ b/go/cmd/install/main.go @@ -52,7 +52,7 @@ const ( // taken from the build info and THIS constant is irrelevant. That's what // lets `@latest` just work — each tagged release resolves to its own // version automatically, without a sed step in release automation. - fallbackVersion = "0.3.60" + fallbackVersion = "0.3.61" BaseURL = "https://github.com/yfedoseev/pdf_oxide/releases/download" // cacheSubdir lives under os.UserCacheDir() — XDG_CACHE_HOME on Linux, // ~/Library/Caches on macOS (Time-Machine-excluded), %LocalAppData% on diff --git a/go/core_parity_test.go b/go/core_parity_test.go new file mode 100644 index 000000000..b197e7a68 --- /dev/null +++ b/go/core_parity_test.go @@ -0,0 +1,118 @@ +//go:build cgo + +package pdfoxide + +import ( + "bytes" + "os" + "path/filepath" + "testing" +) + +// Core functional test-parity suite (Go) — mirrors the shared cross-language +// spec (docs/releases/plans/v0.3.61/core-test-parity-spec.md) with the +// idiomatic Go API. Every binding asserts the same behaviors. +// +// Go has no on-disk fixture dependency: each case builds its own input from +// text (the same approach the rest of the Go suite uses via createTestPDF). 
+ +func parityPDF(t *testing.T) []byte { + t.Helper() + creator, err := FromText("Core parity across all bindings.\nSecond line of text.") + if err != nil { + t.Skipf("FromText unavailable in this build: %v", err) + } + defer creator.Close() + data, err := creator.SaveToBytes() + if err != nil { + t.Fatalf("SaveToBytes failed: %v", err) + } + return data +} + +func parityOpen(t *testing.T) *PdfDocument { + t.Helper() + doc, err := OpenFromBytes(parityPDF(t)) + if err != nil { + t.Fatalf("OpenFromBytes failed: %v", err) + } + return doc +} + +func TestParity_OpenAndPageCount(t *testing.T) { + doc := parityOpen(t) + defer doc.Close() + n, err := doc.PageCount() + if err != nil { + t.Fatalf("PageCount: %v", err) + } + if n != 1 { + t.Errorf("PageCount = %d, want 1", n) + } +} + +func TestParity_ExtractText(t *testing.T) { + doc := parityOpen(t) + defer doc.Close() + if _, err := doc.ExtractText(0); err != nil { + t.Errorf("ExtractText: %v", err) + } +} + +func TestParity_ConvertMarkdownHTMLPlain(t *testing.T) { + doc := parityOpen(t) + defer doc.Close() + if _, err := doc.ToMarkdown(0); err != nil { + t.Errorf("ToMarkdown: %v", err) + } + if _, err := doc.ToHtml(0); err != nil { + t.Errorf("ToHtml: %v", err) + } + if _, err := doc.ToPlainText(0); err != nil { + t.Errorf("ToPlainText: %v", err) + } +} + +func TestParity_Search(t *testing.T) { + doc := parityOpen(t) + defer doc.Close() + if _, err := doc.SearchAll("parity", false); err != nil { + t.Errorf("SearchAll: %v", err) + } +} + +func TestParity_Structured(t *testing.T) { + doc := parityOpen(t) + defer doc.Close() + if _, err := doc.ExtractStructured(0); err != nil { + t.Errorf("ExtractStructured: %v", err) + } +} + +func TestParity_CreatePDF(t *testing.T) { + if data := parityPDF(t); !bytes.HasPrefix(data, []byte("%PDF")) { + t.Errorf("created bytes do not start with %%PDF") + } +} + +func TestParity_OpenFromFile(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "parity.pdf") + if err := 
os.WriteFile(path, parityPDF(t), 0o600); err != nil { + t.Fatalf("WriteFile: %v", err) + } + doc, err := Open(path) + if err != nil { + t.Fatalf("Open: %v", err) + } + defer doc.Close() + if n, _ := doc.PageCount(); n != 1 { + t.Errorf("PageCount = %d, want 1", n) + } +} + +func TestParity_OpenError(t *testing.T) { + if _, err := Open("/no/such/file/does/not/exist.pdf"); err == nil { + t.Error("expected error opening a missing file, got nil") + } +} diff --git a/java/pom.xml b/java/pom.xml index b205b9199..00e573e3c 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -8,7 +8,7 @@ namespace verification under oxide.fyi) artifactId: pdf-oxide (the Maven artifact; matches the package fyi.oxide.pdf) - version: 0.3.60 (lockstep with Cargo workspace / + version: 0.3.61 (lockstep with Cargo workspace / js/package.json / .csproj / pyproject.toml — release-preflight from v0.3.51 #515 enforces parity) @@ -33,7 +33,7 @@ fyi.oxide pdf-oxide - 0.3.60 + 0.3.61 jar pdf_oxide — Java binding @@ -72,7 +72,7 @@ scm:git:https://github.com/yfedoseev/pdf_oxide.git scm:git:git@github.com:yfedoseev/pdf_oxide.git https://github.com/yfedoseev/pdf_oxide - v0.3.60 + v0.3.61 diff --git a/java/src/main/java/fyi/oxide/pdf/internal/NativeLoader.java b/java/src/main/java/fyi/oxide/pdf/internal/NativeLoader.java index 85575594f..f3db52196 100644 --- a/java/src/main/java/fyi/oxide/pdf/internal/NativeLoader.java +++ b/java/src/main/java/fyi/oxide/pdf/internal/NativeLoader.java @@ -117,12 +117,21 @@ public static void ensureLoaded() { } private static void doLoad() { - // 1. Explicit override. + // 1. Explicit override — only when the file actually exists. The Maven + // build sets this property to a Linux `.so` default (pom.xml); on macOS + // (`.dylib`) and Windows (`.dll`) that path is absent, so loading it + // would hard-fail with UnsatisfiedLinkError even though the correct + // platform native is bundled. Fall through to the bundled resource in + // that case instead of failing. 
final String overridePath = System.getProperty(PROP_LIB_PATH); if (overridePath != null && !overridePath.isEmpty()) { - LOG.debug("Loading pdf_oxide_jni from -D{}={}", PROP_LIB_PATH, overridePath); - System.load(overridePath); - return; + if (Files.exists(Paths.get(overridePath))) { + LOG.debug("Loading pdf_oxide_jni from -D{}={}", PROP_LIB_PATH, overridePath); + System.load(overridePath); + return; + } + LOG.debug( + "Override -D{}={} does not exist; falling through to bundled native", PROP_LIB_PATH, overridePath); } // 2. System library opt-in. diff --git a/java/src/test/java/fyi/oxide/pdf/CoreParityTest.java b/java/src/test/java/fyi/oxide/pdf/CoreParityTest.java new file mode 100644 index 000000000..ef33ec33e --- /dev/null +++ b/java/src/test/java/fyi/oxide/pdf/CoreParityTest.java @@ -0,0 +1,92 @@ +/* + * Copyright 2025-2026 Yury Fedoseev and pdf_oxide contributors. + * Licensed under MIT OR Apache-2.0. + */ +package fyi.oxide.pdf; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import fyi.oxide.pdf.exception.PdfIoException; +import org.junit.jupiter.api.Test; + +/** + * Core functional test-parity suite (Java) — mirrors the shared cross-language + * spec ({@code docs/releases/plans/v0.3.61/core-test-parity-spec.md}) with the + * idiomatic Java API. Every binding asserts the same behaviors. + * + *

Each case is self-contained: it builds its own input via + * {@link Pdf#fromMarkdown(String)} and opens it from bytes, so there is no + * fixture-file dependency. + */ +class CoreParityTest { + + private static final String MARKDOWN = "# Core Parity\n\nFunctional parity across all language bindings.\n"; + + private static byte[] buildBytes() { + try (Pdf pdf = Pdf.fromMarkdown(MARKDOWN)) { + return pdf.save(); + } + } + + private static PdfDocument open() { + return PdfDocument.open(buildBytes()); + } + + @Test + void openAndPageCount() { + try (PdfDocument doc = open()) { + assertThat(doc.pageCount()).isGreaterThanOrEqualTo(1); + } + } + + @Test + void extractTextReturnsString() { + try (PdfDocument doc = open()) { + assertThat(doc.extractText(0)).isNotNull(); + } + } + + @Test + void convertMarkdownAndHtmlReturnStrings() { + try (PdfDocument doc = open()) { + assertThat(doc.toMarkdown(0)).isNotNull(); + assertThat(doc.toHtml(0)).isNotNull(); + } + } + + @Test + void searchReturnsList() { + try (PdfDocument doc = open()) { + assertThat(doc.search("parity")).isNotNull(); + } + } + + @Test + void structuredExtraction() { + try (PdfDocument doc = open()) { + assertThat(doc.extractStructured(0)).isNotNull(); + } + } + + @Test + void createPdfFromMarkdown() { + byte[] bytes = buildBytes(); + assertThat(bytes).hasSizeGreaterThan(4); + assertThat(new String(bytes, 0, 5, java.nio.charset.StandardCharsets.ISO_8859_1)) + .isEqualTo("%PDF-"); + } + + @Test + void openFromBytesPageCount() { + try (PdfDocument doc = PdfDocument.open(buildBytes())) { + assertThat(doc.pageCount()).isGreaterThanOrEqualTo(1); + } + } + + @Test + void openingMissingPathThrows() { + assertThatThrownBy(() -> PdfDocument.open("/no/such/file/does/not/exist.pdf")) + .isInstanceOf(PdfIoException.class); + } +} diff --git a/js/README.md b/js/README.md index 1f0c87a19..8bc76a9ab 100644 --- a/js/README.md +++ b/js/README.md @@ -24,6 +24,11 @@ const markdown = doc.toMarkdown(0); doc.close(); ``` +> 
`pdf-oxide` is an ES module. Use `import` (shown above). From CommonJS, load it +> with a dynamic import: `const { PdfDocument } = await import("pdf-oxide");`. +> Open a file with the `PdfDocument.open(path)` factory — the constructor is +> internal and does not take a path. + TypeScript: ```typescript diff --git a/js/package.json b/js/package.json index 84fff37c5..0adfb599c 100644 --- a/js/package.json +++ b/js/package.json @@ -1,6 +1,6 @@ { "name": "pdf-oxide", - "version": "0.3.60", + "version": "0.3.61", "type": "module", "description": "High-performance PDF parsing and text extraction library — prebuilt native bindings, no build toolchain required", "main": "lib/index.js", @@ -20,7 +20,7 @@ "check:publint": "publint", "check:types": "attw --pack . --ignore-rules=cjs-resolves-to-esm", "audit:prod": "npm audit --omit=dev --audit-level=high", - "test": "node --test tests/smoke.test.mjs tests/feature-guard.test.mjs tests/api-coverage.test.mjs tests/html-css.test.mjs tests/document-builder.test.mjs tests/document-editor.test.mjs tests/render-options.test.mjs tests/tables.test.mjs tests/worker-threads-safety.test.mjs" + "test": "node --test tests/smoke.test.mjs tests/feature-guard.test.mjs tests/api-coverage.test.mjs tests/html-css.test.mjs tests/document-builder.test.mjs tests/document-editor.test.mjs tests/render-options.test.mjs tests/tables.test.mjs tests/worker-threads-safety.test.mjs tests/readme-quickstart.test.mjs tests/core-parity.test.mjs" }, "files": [ "lib/", diff --git a/js/src/index.ts b/js/src/index.ts index 83897540a..4f83f64f9 100644 --- a/js/src/index.ts +++ b/js/src/index.ts @@ -370,6 +370,11 @@ class PdfDocumentImpl { private _muPromise: Promise | null = null; constructor(handle: any) { + if (typeof handle === 'string') { + throw new Error( + `PdfDocument constructor takes an internal handle, not a path. 
Use PdfDocument.open(${JSON.stringify(handle)}) to open a file.` + ); + } if (!handle) throw new Error('Failed to open document'); this._handle = handle; } diff --git a/js/src/managers/index.ts b/js/src/managers/index.ts index e790c582d..58a1c665f 100644 --- a/js/src/managers/index.ts +++ b/js/src/managers/index.ts @@ -46,6 +46,20 @@ export { FormFieldManager, FormFieldType, } from '../form-field-manager.js'; +// Hybrid ML page-analysis manager and its enums. Re-exported here so the +// barrel matches the symbols `../index.ts` (and the ESM package entry) pull +// from `./managers/index.js` — `ContentType` in particular is consumed by the +// cross-language core-parity test, which failed at import time when the +// managers barrel omitted it. +export { + type ColumnRegion, + ContentType, + type ExtractionStrategy, + HybridMLManager, + type PageAnalysisResult, + PageComplexity, + type TableRegion, +} from '../hybrid-ml-manager.js'; // Phase 1 Expansion: Result Accessors and Forms export { type AnnotationProperties, @@ -54,6 +68,18 @@ export { ResultAccessorsManager, type SearchResultProperties, } from '../result-accessors-manager.js'; +// Thumbnail manager and its enums. Same rationale: `../index.ts` and the ESM +// entry re-export `ThumbnailManager` / `ThumbnailSize` / `ImageFormat` from this +// barrel, which previously omitted the whole module (the CJS require path +// tolerated the gap silently; the strict ESM parity test does not). +export { + ImageFormat, + type ThumbnailConfig, + type ThumbnailInfo, + ThumbnailManager, + ThumbnailSize, + type ThumbnailStatistics, +} from '../thumbnail-manager.js'; export { AccessibilityManager, type AutoTagResult, @@ -134,12 +160,18 @@ export { MetadataManager, type ValidationResult, } from './metadata-manager.js'; - // Canonical Managers (Phase 9 consolidation) +// Upper-case `OCR*` aliases. 
The package's public surface exposes the +// historical `OCRDetectionMode` / `OCRLanguage` spellings (alongside the +// canonical `Ocr*` ones), and `../index.ts` re-exports them through this +// barrel. `OCRManager` already had its alias in `ocr-manager.ts`; the enum +// aliases were missing, so the strict ESM core-parity import failed on them. export { OCRManager, type OcrConfig, OcrDetectionMode, + OcrDetectionMode as OCRDetectionMode, + OcrLanguage as OCRLanguage, OcrManager, type OcrPageAnalysis, type OcrSpan, diff --git a/js/tests/core-parity.test.mjs b/js/tests/core-parity.test.mjs new file mode 100644 index 000000000..a58f22335 --- /dev/null +++ b/js/tests/core-parity.test.mjs @@ -0,0 +1,77 @@ +// Core functional test-parity suite (Node) — mirrors the shared cross-language +// spec (docs/releases/plans/v0.3.61/core-test-parity-spec.md) with the idiomatic +// Node API. +import assert from 'node:assert'; +import { readFileSync } from 'node:fs'; +import { dirname, join } from 'node:path'; +import { describe, it } from 'node:test'; +import { fileURLToPath } from 'node:url'; +import { Pdf, PdfDocument } from '../lib/index.js'; + +const here = dirname(fileURLToPath(import.meta.url)); +const fixture = join(here, '..', '..', 'tests', 'fixtures', 'simple.pdf'); + +const open = () => PdfDocument.open(fixture); + +describe('core parity (Node)', () => { + it('open + page count == 1', () => { + const doc = open(); + try { + assert.strictEqual(doc.getPageCount(), 1); + } finally { + doc.close(); + } + }); + + it('extract text returns a string', () => { + const doc = open(); + try { + assert.strictEqual(typeof doc.extractText(0), 'string'); + } finally { + doc.close(); + } + }); + + it('convert markdown / html / plain return strings', () => { + const doc = open(); + try { + assert.strictEqual(typeof doc.toMarkdown(0), 'string'); + assert.strictEqual(typeof doc.toHtml(0), 'string'); + assert.strictEqual(typeof doc.toPlainText(0), 'string'); + } finally { + doc.close(); + } 
+ }); + + it('search returns results without throwing', () => { + const doc = open(); + try { + // searchAll is the idiomatic doc-level search in the Node binding + // (mirrors Go's SearchAll / C#'s SearchAll in the parity spec). + const res = doc.searchAll('the'); + assert.ok(res !== undefined && res !== null); + } finally { + doc.close(); + } + }); + + it('create pdf from text → %PDF', () => { + const bytes = Pdf.fromText('Core parity across all bindings.').saveToBytes(); + assert.ok(bytes.length > 0); + assert.strictEqual(bytes.subarray(0, 4).toString('latin1'), '%PDF'); + }); + + it('open from buffer (in-memory bytes)', () => { + const buf = readFileSync(fixture); + const doc = PdfDocument.openFromBuffer(buf); + try { + assert.strictEqual(doc.getPageCount(), 1); + } finally { + doc.close(); + } + }); + + it('opening a missing path throws', () => { + assert.throws(() => PdfDocument.open('/no/such/file/does/not/exist.pdf')); + }); +}); diff --git a/js/tests/html-css.test.mjs b/js/tests/html-css.test.mjs index c572703d7..3c0bcd43b 100644 --- a/js/tests/html-css.test.mjs +++ b/js/tests/html-css.test.mjs @@ -15,6 +15,8 @@ const __dir = dirname(fileURLToPath(import.meta.url)); // otherwise skip font-cascade tests. async function loadFont() { const candidates = [ + // Git-tracked font — available on every OS runner (ubuntu/macOS/windows). 
+ join(__dir, '../../tests/fixtures/fonts/DejaVuSans.ttf'), join(__dir, '../../tools/benchmark-harness/fixtures/fonts/DejaVuSans.ttf'), join(__dir, '../fixtures/DejaVuSans.ttf'), '/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf', diff --git a/js/tests/readme-quickstart.test.mjs b/js/tests/readme-quickstart.test.mjs new file mode 100644 index 000000000..2795096fb --- /dev/null +++ b/js/tests/readme-quickstart.test.mjs @@ -0,0 +1,35 @@ +// Regression test for issue #648 — the documented Node.js quickstart must work, +// and the common misuse (`new PdfDocument(path)`) must fail with an actionable +// error instead of a cryptic native `invalid arguments` TypeError. +import assert from 'node:assert'; +import { dirname, join } from 'node:path'; +import { describe, it } from 'node:test'; +import { fileURLToPath } from 'node:url'; +import { PdfDocument } from '../lib/index.js'; + +const here = dirname(fileURLToPath(import.meta.url)); +const fixture = join(here, '..', '..', 'tests', 'fixtures', 'simple.pdf'); + +describe('README quickstart (#648)', () => { + it('exposes the PdfDocument.open factory', () => { + assert.strictEqual(typeof PdfDocument.open, 'function'); + }); + + it('the documented happy path works: PdfDocument.open(path).extractText(0)', () => { + const doc = PdfDocument.open(fixture); + try { + const text = doc.extractText(0); + assert.strictEqual(typeof text, 'string'); + } finally { + doc.close(); + } + }); + + it('new PdfDocument(path) throws an actionable error pointing at .open()', () => { + assert.throws( + () => new PdfDocument('report.pdf'), + /Use PdfDocument\.open\(/, + 'constructor must reject a path string with a message that names PdfDocument.open' + ); + }); +}); diff --git a/js/tests/render-options.test.mjs b/js/tests/render-options.test.mjs index f36729759..6e04e09c7 100644 --- a/js/tests/render-options.test.mjs +++ b/js/tests/render-options.test.mjs @@ -108,9 +108,12 @@ test('renderToPixmap dimensions match renderPageWithOptions at same DPI', 
{ skip const doc = makeDoc(); const pngBytes = doc.renderPageWithOptions(0, { dpi: 72 }); const px = doc.renderToPixmap(0, 72); + // renderPageWithOptions returns a plain Uint8Array; wrap it (without copying) + // so the Buffer big-endian readers are available. + const png = Buffer.from(pngBytes.buffer, pngBytes.byteOffset, pngBytes.byteLength); // PNG IHDR: width at bytes 16-19, height at 20-23 (big-endian) - const pngW = pngBytes.readUInt32BE(16); - const pngH = pngBytes.readUInt32BE(20); + const pngW = png.readUInt32BE(16); + const pngH = png.readUInt32BE(20); assert.strictEqual(px.width, pngW, 'width must match PNG IHDR'); assert.strictEqual(px.height, pngH, 'height must match PNG IHDR'); }); diff --git a/pdf_oxide_cli/Cargo.toml b/pdf_oxide_cli/Cargo.toml index b55e90a0a..b229b272c 100644 --- a/pdf_oxide_cli/Cargo.toml +++ b/pdf_oxide_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pdf_oxide_cli" -version = "0.3.60" +version = "0.3.61" edition = "2021" description = "CLI for pdf-oxide — the fastest PDF toolkit. 22 commands: text extraction, PDF to markdown, search, merge, split, images, compress, encrypt, watermark, forms, and more." license = "MIT OR Apache-2.0" @@ -34,7 +34,7 @@ workspace = true ocr = ["pdf_oxide/ocr"] [dependencies] -pdf_oxide = { version = "0.3.60", path = "..", features = ["rendering", "logging"] } +pdf_oxide = { version = "0.3.61", path = "..", features = ["rendering", "logging"] } clap = { version = "4", features = ["derive"] } is-terminal = "0.4" serde_json = "1.0" diff --git a/pdf_oxide_jni/Cargo.toml b/pdf_oxide_jni/Cargo.toml index f346e21e0..fb82ffaff 100644 --- a/pdf_oxide_jni/Cargo.toml +++ b/pdf_oxide_jni/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pdf_oxide_jni" -version = "0.3.60" +version = "0.3.61" edition = "2021" description = "JNI bindings for pdf_oxide — native Java binding, the 8th surface alongside Python/Go/JS/C#/WASM/CLI/MCP. Loaded by the fyi.oxide:pdf-oxide Maven artifact." 
license = "MIT OR Apache-2.0" @@ -93,7 +93,7 @@ jni = "0.22" # opt-in FIPS 140-3 build) — those two are compile-time mutually # exclusive (pdf_oxide enforces via compile_error!). We always # enable `icc` for ICC-based colour management. -pdf_oxide = { version = "0.3.60", path = "..", default-features = false, features = ["icc"] } +pdf_oxide = { version = "0.3.61", path = "..", default-features = false, features = ["icc"] } # JSON envelope for the v0.3.51 AutoExtractor rich-result path. The # Java side gets the PageExtraction / DocumentExtraction as a JSON diff --git a/pdf_oxide_mcp/Cargo.toml b/pdf_oxide_mcp/Cargo.toml index 4967d0a6f..06a717d9b 100644 --- a/pdf_oxide_mcp/Cargo.toml +++ b/pdf_oxide_mcp/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pdf_oxide_mcp" -version = "0.3.60" +version = "0.3.61" edition = "2021" description = "MCP server for PDF extraction — gives Claude, Cursor, and AI assistants the ability to read PDFs locally. Text, markdown, and HTML output. Powered by pdf_oxide." license = "MIT OR Apache-2.0" @@ -19,7 +19,7 @@ path = "src/main.rs" workspace = true [dependencies] -pdf_oxide = { version = "0.3.60", path = ".." } +pdf_oxide = { version = "0.3.61", path = ".." } serde_json = "1.0" [dev-dependencies] diff --git a/php/scripts/download-native-lib.php b/php/scripts/download-native-lib.php index 203112a52..b50743b79 100644 --- a/php/scripts/download-native-lib.php +++ b/php/scripts/download-native-lib.php @@ -37,7 +37,7 @@ * - Set `PDF_OXIDE_NATIVE_VERSION=vX.Y.Z` to pin a specific release. */ -const PACKAGE_VERSION_DEFAULT = 'v0.3.60'; +const PACKAGE_VERSION_DEFAULT = 'v0.3.61'; const RELEASE_BASE_URL = 'https://github.com/yfedoseev/pdf_oxide/releases/download'; // Path is relative to the package root (parent-of-php in the new // root-composer.json layout); see comment on $packageRoot below. 
@@ -253,12 +253,12 @@ function downloadFile(string $url, string $dest): bool 'http' => [ 'follow_location' => 1, 'timeout' => 60, - 'user_agent' => 'pdf_oxide-php-installer/0.3.60', + 'user_agent' => 'pdf_oxide-php-installer/0.3.61', ], 'https' => [ 'follow_location' => 1, 'timeout' => 60, - 'user_agent' => 'pdf_oxide-php-installer/0.3.60', + 'user_agent' => 'pdf_oxide-php-installer/0.3.61', ], ]); $data = @file_get_contents($url, false, $ctx); diff --git a/php/src/Pdf.php b/php/src/Pdf.php index 445dd1fa0..d9a01b48f 100644 --- a/php/src/Pdf.php +++ b/php/src/Pdf.php @@ -152,7 +152,7 @@ public static function version(): string * pdf_oxide library version. Kept in sync with `Cargo.toml` by the * release tooling (see `docs/releases/RELEASE_PROCESS.md`). */ - public const VERSION = '0.3.60'; + public const VERSION = '0.3.61'; /** Whether OCR-model prefetch + cache are available on this build. */ public static function prefetchAvailable(): bool diff --git a/php/tests/Integration/CoreParityTest.php b/php/tests/Integration/CoreParityTest.php new file mode 100644 index 000000000..d64471b4e --- /dev/null +++ b/php/tests/Integration/CoreParityTest.php @@ -0,0 +1,97 @@ +fixture('simple.pdf')); + try { + $this->assertSame(1, $doc->pageCount()); + } finally { + $doc->close(); + } + } + + public function testExtractTextReturnsString(): void + { + $doc = PdfDocument::open($this->fixture('simple.pdf')); + try { + $this->assertIsString($doc->extractText(0)); + } finally { + $doc->close(); + } + } + + public function testConvertMarkdownAndHtmlReturnStrings(): void + { + $doc = PdfDocument::open($this->fixture('simple.pdf')); + try { + $this->assertIsString($doc->toMarkdown(0)); + $this->assertIsString($doc->toHtml(0)); + } finally { + $doc->close(); + } + } + + public function testStructuredExtraction(): void + { + $doc = PdfDocument::open($this->fixture('simple.pdf')); + try { + $this->assertIsArray($doc->extractStructured(0)); + } finally { + $doc->close(); + } + } + + public 
function testCreatePdfFromText(): void + { + $pdf = Pdf::fromText('Core parity across all bindings.'); + try { + $bytes = $pdf->save(); + $this->assertSame('%PDF-', substr($bytes, 0, 5)); + } finally { + $pdf->close(); + } + } + + public function testOpenFromBytes(): void + { + $bytes = (string) file_get_contents($this->fixture('simple.pdf')); + $doc = PdfDocument::openBytes($bytes); + try { + $this->assertSame(1, $doc->pageCount()); + } finally { + $doc->close(); + } + } + + public function testOpeningMissingPathThrows(): void + { + $this->expectException(IoException::class); + PdfDocument::open('/no/such/file/does/not/exist.pdf'); + } + + public function testVersionConstant(): void + { + $this->assertSame('0.3.61', Pdf::VERSION); + } +} diff --git a/pyproject.toml b/pyproject.toml index f0996bb84..db2889e12 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "pdf_oxide" -version = "0.3.60" +version = "0.3.61" description = "The fastest Python PDF library: 0.8ms mean, 5× faster than PyMuPDF. Text extraction, markdown conversion, PDF creation. 100% pass rate on 3,830 PDFs." readme = "README.md" requires-python = ">=3.8" diff --git a/python/pdf_oxide/__init__.py b/python/pdf_oxide/__init__.py index e1010ef4c..4df9c1ff0 100644 --- a/python/pdf_oxide/__init__.py +++ b/python/pdf_oxide/__init__.py @@ -47,6 +47,29 @@ from typing import NamedTuple +def _is_ort_lib(name: str) -> bool: + """True if ``name`` is an onnxruntime shared-library filename. 
+ + Mirrors onnxruntime's own packaging globs across platforms: + + * Linux ``libonnxruntime.so`` / ``libonnxruntime.so.1.20.1`` (version + *after* the extension) + * macOS ``libonnxruntime.dylib`` / ``libonnxruntime.1.16.0.dylib`` + (version *before* the extension — the case missed by #632) + * Windows ``onnxruntime.dll`` + + Auxiliary provider libraries (``libonnxruntime_providers_*``) are excluded: + the version separator is required to be ``.`` immediately after + ``libonnxruntime``, which ``_providers`` does not satisfy. + """ + if name == "onnxruntime.dll": + return True + if name.startswith("libonnxruntime.so"): + return True + # macOS versions the library before the extension: libonnxruntime[.VERSION].dylib + return name.startswith("libonnxruntime.") and name.endswith(".dylib") + + def _setup_ort_dylib_path() -> None: """Point ort's dynamic loader at the onnxruntime library shipped by the Python ``onnxruntime`` package, if installed and not already overridden. @@ -73,15 +96,11 @@ def _setup_ort_dylib_path() -> None: capi_dir / "libonnxruntime.dylib", capi_dir / "onnxruntime.dll", ] - # Also match versioned names like libonnxruntime.so.1.20.1 + # Also match versioned names: libonnxruntime.so.1.20.1 (Linux) and + # libonnxruntime.1.16.0.dylib (macOS — version before the extension). if capi_dir.is_dir(): for f in capi_dir.iterdir(): - name = f.name - if ( - name.startswith("libonnxruntime.so") - or name.startswith("libonnxruntime.dylib") - or name == "onnxruntime.dll" - ): + if _is_ort_lib(f.name): candidates.insert(0, f) for candidate in candidates: if candidate.exists(): diff --git a/python/tests/test_core_parity.py b/python/tests/test_core_parity.py new file mode 100644 index 000000000..162d41b1b --- /dev/null +++ b/python/tests/test_core_parity.py @@ -0,0 +1,85 @@ +"""Core functional test-parity suite (Python). + +Mirrors the shared cross-language spec +(docs/releases/plans/v0.3.61/core-test-parity-spec.md) using the idiomatic +Python API. 
Same behaviors are asserted in every binding.""" + +import os + +import pytest + +import pdf_oxide + + +HERE = os.path.dirname(__file__) +FIXTURE = os.path.normpath(os.path.join(HERE, "..", "..", "tests", "fixtures", "simple.pdf")) + + +def _open(): + return pdf_oxide.PdfDocument(FIXTURE) + + +def _build_bytes() -> bytes: + b = pdf_oxide.DocumentBuilder() + ( + b.letter_page() + .font("Helvetica", 12) + .at(72, 720) + .heading(1, "Core Parity") + .at(72, 690) + .paragraph("Functional parity across all language bindings.") + .done() + ) + return b.build() + + +def test_open_and_page_count(): + assert _open().page_count() == 1 + + +def test_extract_text(): + assert isinstance(_open().extract_text(0), str) + + +def test_convert_markdown(): + assert isinstance(_open().to_markdown(0), str) + + +def test_convert_html(): + assert isinstance(_open().to_html(0), str) + + +def test_convert_plain(): + assert isinstance(_open().to_plain_text(0), str) + + +def test_search(): + assert isinstance(_open().search("the"), list) + + +def test_structured(): + assert _open().extract_structured(0) is not None + + +def test_create_pdf(): + assert _build_bytes().startswith(b"%PDF") + + +def test_from_bytes(): + assert pdf_oxide.PdfDocument.from_bytes(_build_bytes()).page_count() == 1 + + +def test_encrypt_roundtrip(): + plain = _build_bytes() + enc = pdf_oxide.PdfDocument.from_bytes(plain).to_bytes_encrypted(user_password="user123") + assert enc.startswith(b"%PDF") + assert enc != plain # encryption changed the bytes + + +def test_open_error(): + with pytest.raises(Exception): # noqa: B017 + pdf_oxide.PdfDocument("/no/such/file/does/not/exist.pdf") + + +def test_version(): + assert pdf_oxide.VERSION == "0.3.61" diff --git a/python/tests/test_ort_dylib_detection.py b/python/tests/test_ort_dylib_detection.py new file mode 100644 index 000000000..197f4debe --- /dev/null +++ b/python/tests/test_ort_dylib_detection.py @@ -0,0 +1,41 @@ +"""#632: onnxruntime shared-library name detection must 
cover the macOS +versioned ``.dylib`` form (``libonnxruntime.1.16.0.dylib``), where the version +precedes the extension — previously only the unversioned ``libonnxruntime.dylib`` +matched, so installed onnxruntime was never found on macOS and OCR was skipped. + +The matcher is pure string logic, so this runs on any OS.""" + +import pytest + +from pdf_oxide import _is_ort_lib + + +@pytest.mark.parametrize( + "name", + [ + "libonnxruntime.so", # Linux unversioned + "libonnxruntime.so.1.20.1", # Linux versioned (version after ext) + "libonnxruntime.dylib", # macOS unversioned + "libonnxruntime.1.16.0.dylib", # macOS versioned (#632 regression) + "libonnxruntime.1.20.1.dylib", # macOS versioned + "onnxruntime.dll", # Windows + ], +) +def test_recognizes_onnxruntime_libraries(name): + assert _is_ort_lib(name) is True + + +@pytest.mark.parametrize( + "name", + [ + "libonnxruntime_providers_shared.dylib", # provider lib, not the runtime + "libonnxruntime_providers_shared.so", + "libonnxruntime_providers_cuda.dylib", + "libfoo.dylib", + "onnxruntime.txt", + "README", + "", + ], +) +def test_rejects_non_runtime_files(name): + assert _is_ort_lib(name) is False diff --git a/ruby/lib/pdf_oxide/version.rb b/ruby/lib/pdf_oxide/version.rb index ad428827c..77285dcf3 100644 --- a/ruby/lib/pdf_oxide/version.rb +++ b/ruby/lib/pdf_oxide/version.rb @@ -1,5 +1,5 @@ # frozen_string_literal: true module PdfOxide - VERSION = '0.3.60' + VERSION = '0.3.61' end diff --git a/ruby/spec/cdylib_smoke_spec.rb b/ruby/spec/cdylib_smoke_spec.rb index e324286a7..a4a2bd2cf 100644 --- a/ruby/spec/cdylib_smoke_spec.rb +++ b/ruby/spec/cdylib_smoke_spec.rb @@ -11,7 +11,7 @@ RSpec.describe 'libpdf_oxide cdylib smoke' do it 'loads the gem with the expected version' do expect(defined?(PdfOxide)).to eq('constant') - expect(PdfOxide::VERSION).to eq('0.3.60') + expect(PdfOxide::VERSION).to eq('0.3.61') end it 'exposes every public-API class' do diff --git a/ruby/spec/core_parity_spec.rb 
b/ruby/spec/core_parity_spec.rb new file mode 100644 index 000000000..78d5b209f --- /dev/null +++ b/ruby/spec/core_parity_spec.rb @@ -0,0 +1,49 @@ +# frozen_string_literal: true + +# Core functional test-parity suite (Ruby) — mirrors the shared cross-language +# spec (docs/releases/plans/v0.3.61/core-test-parity-spec.md) with the idiomatic +# Ruby API. + +require 'pdf_oxide' + +RSpec.describe 'core parity (Ruby)' do + fixture = File.expand_path('../../tests/fixtures/simple.pdf', __dir__) + + it 'open + page count == 1' do + PdfOxide::PdfDocument.open(fixture) do |doc| + expect(doc.page_count).to eq(1) + end + end + + it 'extract text returns a String' do + PdfOxide::PdfDocument.open(fixture) { |doc| expect(doc.extract_text(0)).to be_a(String) } + end + + it 'convert markdown / html return Strings' do + PdfOxide::PdfDocument.open(fixture) do |doc| + expect(doc.to_markdown(0)).to be_a(String) + expect(doc.to_html(0)).to be_a(String) + end + end + + it 'search returns an Array' do + PdfOxide::PdfDocument.open(fixture) { |doc| expect(doc.search('the')).to be_a(Array) } + end + + it 'structured extraction works' do + PdfOxide::PdfDocument.open(fixture) { |doc| expect(doc.extract_structured(0)).not_to be_nil } + end + + it 'create pdf from text → %PDF' do + bytes = PdfOxide::Pdf.from_text('Core parity across all bindings.').to_bytes + expect(bytes[0, 5]).to eq('%PDF-') + end + + it 'opening a missing path raises' do + expect { PdfOxide::PdfDocument.open('/no/such/file/does/not/exist.pdf') }.to raise_error(StandardError) + end + + it 'exposes version 0.3.61' do + expect(PdfOxide::VERSION).to eq('0.3.61') + end +end diff --git a/src/document.rs b/src/document.rs index eda2f4947..5a28c9637 100644 --- a/src/document.rs +++ b/src/document.rs @@ -7075,15 +7075,36 @@ impl PdfDocument { } } if rtl >= 2 && !has_latin { - let reversed: String = span.text.chars().rev().collect(); let mut tmp = span.clone(); - tmp.text = reversed; + tmp.text = 
Self::reverse_rtl_keeping_marks(&span.text); Self::push_span_text(out, &tmp); } else { Self::push_span_text(out, span); } } + /// Reverse a pure-RTL run from visual to logical order while keeping each + /// Arabic/Hebrew combining mark attached to its base letter (#656). + /// + /// A naive `chars().rev()` reverses by Unicode scalar value, so a base + /// letter's diacritics (which follow it in logical order — kasra/shadda + /// U+0650/U+0651, Hebrew points U+05B0..) jump *in front* of the base and + /// float off as standalone marks. Grouping each base char with the + /// combining marks that trail it, then reversing the group order (each + /// group's internal order preserved), keeps marks bound to their base. + fn reverse_rtl_keeping_marks(text: &str) -> String { + use crate::text::rtl_detector::is_rtl_diacritic; + let mut groups: Vec<Vec<char>> = Vec::new(); + for c in text.chars() { + if is_rtl_diacritic(c as u32) && !groups.is_empty() { + groups.last_mut().unwrap().push(c); + } else { + groups.push(vec![c]); + } + } + groups.iter().rev().flatten().collect() + } + /// Parse font size from a /DA (Default Appearance) string. /// /// DA strings follow the format: `"/FontName size Tf ..."` (e.g., `"/Helv 12 Tf 0 g"`). @@ -8626,17 +8647,35 @@ impl PdfDocument { /// span MCIDs and for any MCID containing RTL text (whose span order is /// handled by the bidi passes) — both stay byte-identical. 
fn order_mcid_spans(spans: &[crate::layout::TextSpan]) -> Vec<&crate::layout::TextSpan> { + use crate::text::rtl_detector::is_rtl_text; let mut ordered: Vec<&crate::layout::TextSpan> = spans.iter().collect(); - let has_rtl = |s: &crate::layout::TextSpan| { - s.text - .chars() - .any(|c| crate::text::rtl_detector::is_rtl_text(c as u32)) - }; - if spans.len() > 1 && !spans.iter().any(has_rtl) { + if spans.len() <= 1 { + return ordered; + } + let has_rtl = spans + .iter() + .any(|s| s.text.chars().any(|c| is_rtl_text(c as u32))); + let has_latin = spans + .iter() + .any(|s| s.text.chars().any(|c| c.is_ascii_alphabetic())); + if !has_rtl { + // LTR multi-span MCID: left-to-right row-aware reading order. ordered.sort_by(|a, b| { crate::utils::row_aware_span_cmp(a.bbox.y, a.bbox.x, b.bbox.y, b.bbox.x) }); + } else if !has_latin { + // #656/#657: pure-RTL MCID. The tagged struct-tree path never + // reaches `reverse_rtl_visual_order_runs`, so without an explicit + // span-order pass the words emerge in visual (reversed) sequence. + // Emitting each row right-to-left (X descending) reconstructs + // logical reading order from geometry, independent of whether the + // producer stored the run visually or logically. Per-span glyph + // order is corrected separately by `push_span_text_bidi`. + ordered.sort_by(|a, b| { + crate::utils::row_aware_span_cmp_rtl(a.bbox.y, a.bbox.x, b.bbox.y, b.bbox.x) + }); } + // Mixed RTL+Latin MCIDs keep raw order (full UAX #9 bidi deferred). ordered } @@ -9415,22 +9454,41 @@ impl PdfDocument { lower.starts_with("cm") || lower.contains("symbol") } - /// Replace a `¬` (U+00AC) that sits directly between two ASCII digits with - /// `.` (the decimal point a math subset drew from its `logicalnot` slot). - /// Leaves every other `¬` untouched. + /// Replace a `¬` (U+00AC) that a math subset drew from its `logicalnot` + /// slot as a decimal point. Two shapes are recovered: + /// + /// - `digit ¬ digit` → `digit.digit` (e.g. 
`1¬00` → `1.00`) + /// - `digit ¬ digit` → `digit.digit` (e.g. `1¬ 00` → `1.00`) + /// + /// The second form covers subsets that emit a single space between the + /// decimal glyph and the fractional digits; the lone separating space is + /// dropped so the number reads as one token. The leading digit must abut + /// `¬` directly in both shapes, so a genuinely spaced negation (`5 ¬ 3`, + /// `A ¬ B`) is left untouched. Every other `¬` is preserved. fn fix_digit_logicalnot_decimal(text: &str) -> String { let chars: Vec = text.chars().collect(); let mut out = String::with_capacity(text.len()); - for (i, &c) in chars.iter().enumerate() { - if c == '\u{00AC}' - && i > 0 - && chars[i - 1].is_ascii_digit() - && chars.get(i + 1).is_some_and(|n| n.is_ascii_digit()) - { - out.push('.'); - } else { - out.push(c); + let mut i = 0; + while i < chars.len() { + let c = chars[i]; + if c == '\u{00AC}' && i > 0 && chars[i - 1].is_ascii_digit() { + // Unspaced: digit ¬ digit. + if chars.get(i + 1).is_some_and(|n| n.is_ascii_digit()) { + out.push('.'); + i += 1; + continue; + } + // Spaced: digit ¬ digit — drop the lone space. + if chars.get(i + 1) == Some(&' ') + && chars.get(i + 2).is_some_and(|n| n.is_ascii_digit()) + { + out.push('.'); + i += 2; // skip the ¬ and the single separating space + continue; + } } + out.push(c); + i += 1; } out } @@ -15446,6 +15504,16 @@ impl PdfDocument { page_index: usize, options: &crate::converters::ConversionOptions, ) -> Result { + // Encrypted-and-undecryptable parity: extract_text / to_markdown / to_html + // all short-circuit to an empty string here (ISO 32000-1:2008 §7.6); the + // geometric plain-text path below would also yield empty (no decryptable + // content) but went through the full pipeline first. Guard explicitly so + // every text surface returns the same empty result on the same input. 
+ if self.is_encrypted_unreadable() { + log::warn!("PDF is encrypted and could not be decrypted; returning empty text"); + return Ok(String::new()); + } + // #608: for a trustworthy tagged PDF, read in logical structure order // (§14.8.2.3.1) by assembling directly from the structure tree — the // same path `extract_text` uses. The geometric plain-text converter @@ -19459,6 +19527,65 @@ mod tests { ); } + // #656/#657: the tagged struct-tree path collapses a page into one MCID + // whose pure-RTL word-spans are laid out left-to-right (visual, X + // ascending). `order_mcid_spans` must emit them right-to-left (logical) + // using geometry, since the tagged path never reaches the untagged + // `reverse_rtl_visual_order_runs`. (Per-span glyph order is handled + // separately by `push_span_text_bidi`; this test asserts span ORDER.) + #[test] + fn test_order_mcid_spans_pure_rtl_emitted_right_to_left() { + // One Hebrew row, three words placed left-to-right by X. + let spans = vec![ + make_rtl_test_span("שלוש", 100.0, 700.0), // leftmost → logically last + make_rtl_test_span("שתיים", 200.0, 700.0), + make_rtl_test_span("אחת", 300.0, 700.0), // rightmost → logically first + ]; + let ordered = PdfDocument::order_mcid_spans(&spans); + let texts: Vec<&str> = ordered.iter().map(|s| s.text.as_str()).collect(); + assert_eq!( + texts, + vec!["אחת", "שתיים", "שלוש"], + "pure-RTL MCID spans must emit rightmost-first (logical RTL order), got {texts:?}" + ); + } + + // #656: grapheme-aware RTL reversal keeps Arabic combining marks bound to + // their base letter (vs. a naive chars().rev() that floats them off). + #[test] + fn test_reverse_rtl_keeping_marks_keeps_diacritics_attached() { + // قِطّ = QAF + KASRA(U+0650) + TAH + SHADDA(U+0651). Reversing must + // keep each mark immediately after its base, not lead the string. 
+ let src = "\u{0642}\u{0650}\u{0637}\u{0651}"; // قِطّ + let out = PdfDocument::reverse_rtl_keeping_marks(src); + // Expected: base order reversed (TAH+SHADDA group, then QAF+KASRA group). + assert_eq!(out, "\u{0637}\u{0651}\u{0642}\u{0650}"); + // No combining mark ever leads a base it doesn't belong to: every + // diacritic is immediately preceded by a non-diacritic. + let chars: Vec = out.chars().collect(); + for (i, c) in chars.iter().enumerate() { + if crate::text::rtl_detector::is_rtl_diacritic(*c as u32) { + assert!( + i > 0 && !crate::text::rtl_detector::is_rtl_diacritic(chars[i - 1] as u32), + "diacritic at {i} is detached from its base" + ); + } + } + } + + // Mixed RTL+Latin MCIDs are left in raw order (full UAX #9 deferred) — + // guards against the pure-RTL reorder accidentally firing on mixed runs. + #[test] + fn test_order_mcid_spans_mixed_rtl_latin_kept_raw() { + let spans = vec![ + make_rtl_test_span("שלום", 100.0, 700.0), + make_rtl_test_span("World", 200.0, 700.0), + ]; + let ordered = PdfDocument::order_mcid_spans(&spans); + let texts: Vec<&str> = ordered.iter().map(|s| s.text.as_str()).collect(); + assert_eq!(texts, vec!["שלום", "World"], "mixed RTL+Latin must stay in raw order"); + } + // #553: bare page-number detection (applied only inside the margin band). #[test] fn test_is_bare_page_number_text() { @@ -22669,6 +22796,20 @@ mod tests { assert_eq!(PdfDocument::fix_digit_logicalnot_decimal("5 \u{00AC} 3"), "5 \u{00AC} 3"); // Leading/trailing `¬` with only one digit neighbour: untouched. assert_eq!(PdfDocument::fix_digit_logicalnot_decimal("\u{00AC}5"), "\u{00AC}5"); + // Spaced decimal: a subset that emits a single space between the decimal + // glyph and the fractional digits → drop the lone space, recover `.`. 
+ assert_eq!(PdfDocument::fix_digit_logicalnot_decimal("1\u{00AC} 00"), "1.00"); + assert_eq!( + PdfDocument::fix_digit_logicalnot_decimal("0\u{00AC} 75 1\u{00AC} 00"), + "0.75 1.00" + ); + // Still NOT a decimal when the leading digit does not abut `¬` + // (genuine spaced negation): `5 ¬ 3` stays untouched even though a + // digit follows the space. + assert_eq!(PdfDocument::fix_digit_logicalnot_decimal("5 \u{00AC} 3"), "5 \u{00AC} 3"); + // Only a single separating space is absorbed; two spaces is not a + // decimal rendering and is left alone. + assert_eq!(PdfDocument::fix_digit_logicalnot_decimal("1\u{00AC} 00"), "1\u{00AC} 00"); } #[test] diff --git a/src/editor/document_editor.rs b/src/editor/document_editor.rs index 29c0b2fca..4245686db 100644 --- a/src/editor/document_editor.rs +++ b/src/editor/document_editor.rs @@ -2007,6 +2007,23 @@ impl DocumentEditor { // value. Also sets /NeedAppearances on the existing AcroForm. self.flush_form_fields_to_modified_objects()?; + // #647: for an *inline* AcroForm the flush above sets + // /NeedAppearances true by re-writing the catalog into + // `modified_objects`, but `catalog_obj` was snapshotted before the + // flush — so without this the patch is silently dropped and viewers + // honour the field's pre-existing empty /AP /N over /DA, rendering + // the field blank (ISO 32000-1 §12.7.3.3, Table 226 — AP takes + // precedence over DA). Re-adopt the patched catalog. Guarded to the + // pure-fill case: the flatten branches above already rebuild the + // catalog's AcroForm in-place (and must not be clobbered), and the + // new-fields branch below emits an AcroForm with NeedAppearances via + // AcroFormBuilder. + if !self.remove_acroform && self.flatten_forms_pages.is_empty() { + if let Some(patched) = self.modified_objects.get(&catalog_ref.id) { + catalog_obj = patched.clone(); + } + } + // Only genuinely NEW fields need freshly-allocated objects plus // /Fields and /Annots entries. 
Existing fields keep their object // ids and their place in /Fields/Annots (updated in place above). diff --git a/src/extractors/text.rs b/src/extractors/text.rs index b6034a1f4..f44d32508 100644 --- a/src/extractors/text.rs +++ b/src/extractors/text.rs @@ -1052,6 +1052,23 @@ fn should_insert_space( return SpaceDecision::no_space(SpaceSource::AlreadyPresent, 1.0); } + // Rule 0.3: Complex-script combining-mark guard (#656-class Indic gap). + // A Brahmic/Thai/Khmer dependent vowel sign, virama, or tone mark followed + // by another character of a complex script is intra-word — the mark carries + // its own advance, so the geometric gap and consensus paths below would + // otherwise emit a spurious word space (the dominant matra→consonant error + // for Tamil/Bengali/Devanagari). Genuine word breaks carry an explicit + // space glyph, already handled by Rule 0. This guards the strong-geometric + // and consensus branches, which never consult `WordBoundaryDetector`. + if let (Some(pc), Some(nc)) = + (preceding_text.chars().next_back(), following_text.chars().next()) + { + use crate::text::complex_script_detector::{detect_complex_script, is_complex_script_mark}; + if is_complex_script_mark(pc as u32) && detect_complex_script(nc as u32).is_some() { + return SpaceDecision::no_space(SpaceSource::NoSpace, 0.9); + } + } + // Rule 0.4: Emoji / pictographic → letter boundary. // A wide pictographic glyph (e.g. 📄) advances far, so the residual gap to // the next token falls below the proportional-font space threshold and the @@ -4446,6 +4463,27 @@ impl<'doc> TextExtractor<'doc> { current.bbox.width = new_width; current.bbox.height = new_height; + // Keep `char_widths` in lockstep with the merged text. The + // downstream width-based splitters `is_column_spanning_decimal` + // and `char_widths_boundary_split` (document.rs) fire when + // `char_widths.len() < char_count`, so a merged multi-glyph span + // (e.g. 
per-glyph `Td Tj` table cells like "0.99" / "Q1") + // would otherwise be wrongly split — dropping the decimal point + // ("0.99" → "0 99") or gluing a space at the letter→digit + // boundary ("Q1" → "Q 1"). Append this span's per-glyph widths, + // then pad to the exact char count to cover any inserted '.'/ + // ' ' separator (or a source span whose widths were sparse). + current.char_widths.extend_from_slice(&span.char_widths); + let merged_char_count = current.text.chars().count(); + if current.char_widths.len() != merged_char_count { + let pad = if current.font_size > 0.0 { + current.font_size * 0.25 + } else { + 1.0 + }; + current.char_widths.resize(merged_char_count, pad); + } + // After a cross-font glue, adopt the longer run's font // metadata. The single-letter side was typographic // decoration, not semantic emphasis, so the dominant-run diff --git a/src/fonts/font_dict.rs b/src/fonts/font_dict.rs index d722d34d9..9927ae86e 100644 --- a/src/fonts/font_dict.rs +++ b/src/fonts/font_dict.rs @@ -3275,8 +3275,21 @@ impl FontInfo { /// The width of the space character (code 0x20) in 1000ths of em, /// or the font's default width if the space glyph is not defined. pub fn get_space_glyph_width(&self) -> f32 { - // Space character is always code 0x20 (32) in PDF - self.get_glyph_width(0x20) + // Space character is always code 0x20 (32) in PDF. + let w = self.get_glyph_width(0x20); + // Many CID-keyed subset fonts (notably shaped Arabic from Chrome / + // browser print) omit a glyph for code 0x20 entirely, so this returns + // ~0. Callers derive their geometric word-gap threshold from this + // width (threshold = space_width × ratio); a zero width collapses the + // threshold to 0, so *every* inter-glyph kerning gap is read as a word + // boundary and cursive Arabic words shatter into single letters (#656). + // Fall back to a typographic default of 0.25 em (250 font units) — the + // same value `should_insert_space` uses when the font is absent. 
+ if w < 50.0 { + 250.0 + } else { + w + } } /// Map a Glyph ID (GID) to a standard PostScript glyph name. diff --git a/src/lib.rs b/src/lib.rs index b5d2f152a..171fee084 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -423,6 +423,30 @@ pub(crate) mod utils { } } + /// Right-to-left variant of [`row_aware_span_cmp`] (issues #656/#657). + /// + /// Identical row banding (lines top-to-bottom), but orders spans + /// **right-to-left within a row** (X descending). A pure-RTL line's + /// logical reading order *is* its rightmost-first geometric order, so + /// sorting word-spans by descending X reconstructs logical order + /// directly from page geometry — independent of whether the producer + /// stored the run in visual or logical order. Used by the tagged + /// struct-tree assemblers, which otherwise have no span-order pass for + /// RTL (the untagged `reverse_rtl_visual_order_runs` is never reached + /// on tagged pages). + #[inline] + pub fn row_aware_span_cmp_rtl(a_y: f32, a_x: f32, b_y: f32, b_x: f32) -> Ordering { + if !a_y.is_finite() || !b_y.is_finite() { + return safe_float_cmp(b_y, a_y).then_with(|| safe_float_cmp(b_x, a_x)); + } + let band_a = (a_y / ROW_BAND_TOLERANCE_PT).round() as i32; + let band_b = (b_y / ROW_BAND_TOLERANCE_PT).round() as i32; + match band_b.cmp(&band_a) { + Ordering::Equal => safe_float_cmp(b_x, a_x), // X descending = RTL + other => other, + } + } + /// Safely compare two floating point numbers, handling NaN cases. /// /// NaN values are treated as equal to each other and greater than all other values. @@ -657,6 +681,43 @@ pub(crate) mod utils { v.sort_by(|a, b| row_aware_span_cmp(a.0, a.1, b.0, b.1)); } + /// #656/#657: the RTL variant keeps rows top-to-bottom but orders + /// X *descending* (right-to-left) within a row — a pure-RTL line's + /// logical reading order. + #[test] + fn test_row_aware_span_cmp_rtl_within_row_is_descending() { + // Same row (Y within band), laid out left-to-right by X. 
+ let mut row = [ + (100.0f32, 10.0f32, "leftmost"), + (100.0, 50.0, "mid"), + (100.0, 90.0, "rightmost"), + ]; + row.sort_by(|a, b| row_aware_span_cmp_rtl(a.0, a.1, b.0, b.1)); + // Rightmost (highest X) reads first in RTL. + assert_eq!(["rightmost", "mid", "leftmost"], [row[0].2, row[1].2, row[2].2]); + } + + /// Rows still order top-to-bottom regardless of the within-row flip. + #[test] + fn test_row_aware_span_cmp_rtl_rows_top_to_bottom() { + let mut rows = [ + (10.0f32, 0.0f32, "bottom"), + (100.0, 0.0, "top"), + (50.0, 0.0, "middle"), + ]; + rows.sort_by(|a, b| row_aware_span_cmp_rtl(a.0, a.1, b.0, b.1)); + assert_eq!(["top", "middle", "bottom"], [rows[0].2, rows[1].2, rows[2].2]); + } + + /// Must be a valid total order for `sort_by` (no transitivity panic). + #[test] + fn test_row_aware_span_cmp_rtl_is_total_order() { + let mut v: Vec<(f32, f32)> = (0..200) + .map(|i| ((i as f32) * 0.73, ((i * 17) % 500) as f32)) + .collect(); + v.sort_by(|a, b| row_aware_span_cmp_rtl(a.0, a.1, b.0, b.1)); + } + /// Sort a large array with mixed NaN/normal values to stress-test. #[test] fn test_sort_stress_with_nan() { diff --git a/src/pipeline/ordered_span.rs b/src/pipeline/ordered_span.rs index 56b191379..fc0efebf4 100644 --- a/src/pipeline/ordered_span.rs +++ b/src/pipeline/ordered_span.rs @@ -20,6 +20,11 @@ pub enum ReadingOrderSource { /// /// Confidence: 0.90 (robust for multi-column layouts). XYCut, + /// Order from article threads (`/Threads`, ISO 32000-1:2008 §12.4.3). + /// + /// Confidence: 0.95 (author-supplied explicit reading order; ranks below + /// the structure tree but above geometric inference). + ArticleThread, /// Order from geometric column analysis. /// /// Confidence: 0.85 (good for standard column layouts). 
@@ -44,6 +49,7 @@ impl ReadingOrderSource { pub fn default_confidence(&self) -> f32 { match self { ReadingOrderSource::StructureTree => 1.0, + ReadingOrderSource::ArticleThread => 0.95, ReadingOrderSource::XYCut => 0.90, ReadingOrderSource::Geometric => 0.85, ReadingOrderSource::Simple => 0.75, @@ -56,6 +62,7 @@ impl ReadingOrderSource { pub fn name(&self) -> &'static str { match self { ReadingOrderSource::StructureTree => "StructureTree", + ReadingOrderSource::ArticleThread => "ArticleThread", ReadingOrderSource::XYCut => "XYCut", ReadingOrderSource::Geometric => "Geometric", ReadingOrderSource::Simple => "Simple", @@ -104,6 +111,11 @@ impl ReadingOrderInfo { Self::from_source(ReadingOrderSource::XYCut) } + /// Create for article-thread source (`/Threads`, §12.4.3). + pub fn article_thread() -> Self { + Self::from_source(ReadingOrderSource::ArticleThread) + } + /// Create for geometric source. pub fn geometric() -> Self { Self::from_source(ReadingOrderSource::Geometric) diff --git a/src/pipeline/page_order.rs b/src/pipeline/page_order.rs index c02a045a5..aeb08dd05 100644 --- a/src/pipeline/page_order.rs +++ b/src/pipeline/page_order.rs @@ -69,7 +69,17 @@ fn page_reading_order_inner( return Ok(Vec::new()); } + // Article threads (#458): the parser (`crate::structure::parse_article_threads`) + // and `ArticleThreadStrategy` ship as a tested foundation, but are NOT yet + // auto-wired into this default path. The v0.3.61 corpus sweep showed the + // ≥80%-bead-coverage gate activated on regular technical books (single-column, + // where geometric order is already correct) and reordered content + // non-improvingly. Until the activation gate can be proven to *only improve* + // on true multi-column magazine threads (deferred → v0.3.62), the default + // reading order stays geometric so the corpus is byte-identical. Callers can + // still use the parser/strategy directly. 
let context = build_context(doc, page_index); + let pipeline = TextPipeline::with_config(TextPipelineConfig::default()); pipeline.process(spans, context) } diff --git a/src/pipeline/reading_order/article_thread.rs b/src/pipeline/reading_order/article_thread.rs new file mode 100644 index 000000000..d57e1b7d8 --- /dev/null +++ b/src/pipeline/reading_order/article_thread.rs @@ -0,0 +1,201 @@ +//! Article-thread reading order strategy (ISO 32000-1:2008 §12.4.3). +//! +//! When a page is governed by article-thread beads (`/Threads`), spans are read +//! by walking the beads in their chain (`/N`) order: all spans whose centre +//! falls inside a bead are emitted together, ordered top-to-bottom/left-to-right +//! within the bead. Spans captured by no bead are appended via the geometric +//! fallback so nothing is dropped (the "partial coverage" case). +//! +//! This strategy only takes effect when [`ReadingOrderContext::bead_rects`] is +//! populated. The canonical [`crate::pipeline::page_order`] helper does not +//! populate it yet (auto-wiring is deferred), so callers must supply the bead +//! rects themselves; with none, the geometric path runs unchanged (fails closed). + +use crate::error::Result; +use crate::geometry::{Point, Rect}; +use crate::layout::TextSpan; +use crate::pipeline::{OrderedTextSpan, ReadingOrderInfo}; + +use super::{ReadingOrderContext, ReadingOrderStrategy, XYCutStrategy}; + +/// Article-thread (`/Threads`) reading order strategy. +pub struct ArticleThreadStrategy { + /// Fallback for spans not captured by any bead, and for the no-bead case. + fallback: XYCutStrategy, +} + +impl ArticleThreadStrategy { + /// Construct a new strategy with a default XY-cut fallback. + pub fn new() -> Self { + Self { + fallback: XYCutStrategy::new(), + } + } +} + +impl Default for ArticleThreadStrategy { + fn default() -> Self { + Self::new() + } +} + +/// Centre point of a span's bounding box. 
+fn span_center(span: &TextSpan) -> Point { + Point { + x: span.bbox.x + span.bbox.width * 0.5, + y: span.bbox.y + span.bbox.height * 0.5, + } +} + +/// Sort indices into `spans` top-to-bottom (Y descending), then left-to-right +/// (X ascending) — matching `SimpleStrategy`'s convention for a single region. +fn sort_reading_within_region(indices: &mut [usize], spans: &[TextSpan]) { + indices.sort_by(|&a, &b| { + let y = crate::utils::safe_float_cmp(spans[b].bbox.y, spans[a].bbox.y); + if y != std::cmp::Ordering::Equal { + return y; + } + crate::utils::safe_float_cmp(spans[a].bbox.x, spans[b].bbox.x) + }); +} + +impl ReadingOrderStrategy for ArticleThreadStrategy { + fn apply( + &self, + spans: Vec<TextSpan>, + context: &ReadingOrderContext, + ) -> Result<Vec<OrderedTextSpan>> { + // No bead rects → behave exactly like the geometric fallback. + let beads: &[Rect] = match &context.bead_rects { + Some(b) if !b.is_empty() => b, + _ => return self.fallback.apply(spans, context), + }; + + // Assign each span to the first bead (in chain order) that contains its + // centre; spans matching no bead are left for the geometric fallback. + let mut per_bead: Vec<Vec<usize>> = vec![Vec::new(); beads.len()]; + let mut leftover: Vec<usize> = Vec::new(); + for (i, span) in spans.iter().enumerate() { + let c = span_center(span); + match beads.iter().position(|r| r.contains_point(&c)) { + Some(b) => per_bead[b].push(i), + None => leftover.push(i), + } + } + + // Within each bead, order spans geometrically; emit beads in chain order. + let mut order_for_index: Vec<Option<usize>> = vec![None; spans.len()]; + let mut next_order = 0usize; + for bead_indices in per_bead.iter_mut() { + sort_reading_within_region(bead_indices, &spans); + for &i in bead_indices.iter() { + order_for_index[i] = Some(next_order); + next_order += 1; + } + } + + // Build the captured output (beads), tagged as ArticleThread. 
+ let mut captured: Vec<(usize, TextSpan)> = Vec::new(); + // Leftover spans are ordered by the geometric fallback and appended + // after all bead content, preserving their relative geometric order. + let leftover_spans: Vec<TextSpan> = leftover.iter().map(|&i| spans[i].clone()).collect(); + + for (i, span) in spans.into_iter().enumerate() { + if let Some(order) = order_for_index[i] { + captured.push((order, span)); + } + } + captured.sort_by_key(|(order, _)| *order); + + let mut result: Vec<OrderedTextSpan> = captured + .into_iter() + .map(|(order, span)| { + OrderedTextSpan::with_info(span, order, ReadingOrderInfo::article_thread()) + }) + .collect(); + + if !leftover_spans.is_empty() { + let tail = self.fallback.apply(leftover_spans, context)?; + let base = result.len(); + for (k, mut o) in tail.into_iter().enumerate() { + o.reading_order = base + k; + result.push(o); + } + } + + Ok(result) + } + + fn name(&self) -> &'static str { + "ArticleThreadStrategy" + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::geometry::Rect; + use crate::layout::TextSpan; + use crate::pipeline::ReadingOrderSource; + + fn span(text: &str, x: f32, y: f32) -> TextSpan { + TextSpan { + text: text.to_string(), + bbox: Rect::new(x, y, 20.0, 10.0), + font_size: 10.0, + ..TextSpan::default() + } + } + + fn texts(ordered: &[OrderedTextSpan]) -> Vec<String> { + ordered.iter().map(|o| o.span.text.clone()).collect() + } + + #[test] + fn two_column_beads_read_left_column_then_right() { + // Two bead columns: left bead (x 0..100), right bead (x 200..300). + // Spans are supplied in a scrambled order; thread order must be the + // left column top-to-bottom, then the right column top-to-bottom. 
+ let spans = vec![ + span("R-top", 210.0, 500.0), + span("L-bot", 10.0, 400.0), + span("R-bot", 210.0, 400.0), + span("L-top", 10.0, 500.0), + ]; + let ctx = ReadingOrderContext::new().with_bead_rects(vec![ + Rect::from_points(0.0, 380.0, 100.0, 520.0), // left column + Rect::from_points(200.0, 380.0, 300.0, 520.0), // right column + ]); + + let ordered = ArticleThreadStrategy::new().apply(spans, &ctx).unwrap(); + assert_eq!(texts(&ordered), vec!["L-top", "L-bot", "R-top", "R-bot"]); + assert!(ordered + .iter() + .all(|o| o.order_info.source == ReadingOrderSource::ArticleThread)); + } + + #[test] + fn spans_outside_all_beads_are_appended_not_dropped() { + let spans = vec![ + span("in-bead", 10.0, 500.0), + span("orphan", 400.0, 100.0), // outside every bead + ]; + let ctx = ReadingOrderContext::new() + .with_bead_rects(vec![Rect::from_points(0.0, 480.0, 100.0, 520.0)]); + + let ordered = ArticleThreadStrategy::new().apply(spans, &ctx).unwrap(); + let t = texts(&ordered); + assert_eq!(t.len(), 2, "no span may be dropped"); + assert_eq!(t[0], "in-bead", "bead content comes first"); + assert!(t.contains(&"orphan".to_string()), "orphan must be appended"); + } + + #[test] + fn no_bead_rects_falls_back_to_geometric() { + // Empty/absent bead rects → identical to the geometric fallback. + let spans = vec![span("a", 10.0, 500.0), span("b", 10.0, 400.0)]; + let ctx = ReadingOrderContext::new(); + let ordered = ArticleThreadStrategy::new().apply(spans, &ctx).unwrap(); + assert_eq!(ordered.len(), 2); + } +} diff --git a/src/pipeline/reading_order/mod.rs b/src/pipeline/reading_order/mod.rs index f32095d52..2d2154613 100644 --- a/src/pipeline/reading_order/mod.rs +++ b/src/pipeline/reading_order/mod.rs @@ -10,6 +10,7 @@ //! - [`XYCutStrategy`]: Recursive XY-Cut spatial partitioning (newspapers, academic papers) //! 
- [`SimpleStrategy`]: Simple top-to-bottom, left-to-right ordering +pub mod article_thread; pub mod detectors; pub mod geometric; pub mod simple; @@ -17,6 +18,7 @@ pub mod structure_tree; pub mod tategaki; pub mod xycut; +pub use article_thread::ArticleThreadStrategy; pub use detectors::{ classify_region, detect_dense_single_line, detect_dramatic_script, detect_narrow_tracked, detect_sub_super_glyphs, DetectorGlyph, ReadingOrderClass, @@ -77,6 +79,15 @@ pub struct ReadingOrderContext { /// MCID to reading order mapping (if structure tree available). pub mcid_order: Option>, + /// Ordered article-thread bead rectangles for this page, in `/N` order + /// (ISO 32000-1:2008 §12.4.3). Set only when the document declares + /// `/Threads`, no trustworthy structure tree governs the page, and the + /// beads cover enough of the page text to be trusted. When present, the + /// reading-order strategy threads spans through these regions instead of + /// using pure geometric inference. `None` ⇒ the geometric path is unchanged + /// (fails closed). + pub bead_rects: Option>, + /// Whether the structure tree contains suspect (unreliable) content. /// /// Per ISO 32000-1:2008 Section 14.7.1, when this is true, the structure @@ -118,6 +129,12 @@ impl ReadingOrderContext { self.suspects = suspects; self } + + /// Set the ordered article-thread bead rectangles for this page. + pub fn with_bead_rects(mut self, bead_rects: Vec) -> Self { + self.bead_rects = Some(bead_rects); + self + } } /// Create a reading order strategy based on configuration. 
diff --git a/src/pipeline/reading_order/structure_tree.rs b/src/pipeline/reading_order/structure_tree.rs index a43e5f6cf..187a66290 100644 --- a/src/pipeline/reading_order/structure_tree.rs +++ b/src/pipeline/reading_order/structure_tree.rs @@ -7,7 +7,7 @@ use crate::error::Result; use crate::layout::TextSpan; use crate::pipeline::{OrderedTextSpan, ReadingOrderInfo}; -use super::{ReadingOrderContext, ReadingOrderStrategy, XYCutStrategy}; +use super::{ArticleThreadStrategy, ReadingOrderContext, ReadingOrderStrategy, XYCutStrategy}; /// Structure tree-based reading order strategy. /// @@ -30,6 +30,23 @@ impl StructureTreeStrategy { fallback: XYCutStrategy::new(), } } + + /// Order spans when the structure tree cannot be trusted for this page. + /// + /// Prefers article-thread order (`/Threads`, §12.4.3) when the canonical + /// helper supplied bead rectangles for the page; otherwise uses the + /// geometric XY-cut fallback. With no bead rects this is exactly the prior + /// behaviour (fails closed). + fn fallback_order( + &self, + spans: Vec, + context: &ReadingOrderContext, + ) -> Result> { + if context.bead_rects.as_ref().is_some_and(|b| !b.is_empty()) { + return ArticleThreadStrategy::new().apply(spans, context); + } + self.fallback.apply(spans, context) + } } /// Detect whether applying `mcid_order` to `spans` would produce a @@ -119,13 +136,14 @@ impl ReadingOrderStrategy for StructureTreeStrategy { // tree may contain errors or unreliable content. if context.suspects { log::debug!("Structure tree marked as suspect, falling back to geometric ordering"); - return self.fallback.apply(spans, context); + return self.fallback_order(spans, context); } - // If no structure tree or MCID order, fall back to geometric strategy + // If no structure tree or MCID order, fall back (article threads first, + // then geometric) — see `fallback_order`. 
let mcid_order = match &context.mcid_order { Some(order) if !order.is_empty() => order, - _ => return self.fallback.apply(spans, context), + _ => return self.fallback_order(spans, context), }; // Trust-check: if the MCID ordering would zigzag horizontally @@ -135,7 +153,7 @@ impl ReadingOrderStrategy for StructureTreeStrategy { // respecting column visual order). Fall back to geometric. if mcid_order_zigzags_columns(&spans, mcid_order) { log::debug!("MCID order zigzags across columns, falling back to geometric ordering"); - return self.fallback.apply(spans, context); + return self.fallback_order(spans, context); } // Create MCID -> reading order mapping diff --git a/src/pipeline/reading_order/xycut.rs b/src/pipeline/reading_order/xycut.rs index 2675067d5..588de4f6c 100644 --- a/src/pipeline/reading_order/xycut.rs +++ b/src/pipeline/reading_order/xycut.rs @@ -1591,7 +1591,9 @@ impl XYCutStrategy { let half = smooth_window / 2; // Smooth into a reused thread-local buffer instead of a fresh `Vec` per - // failed-valley node. Window-mean is unchanged. + // failed-valley node. Window-mean is unchanged. (Confirmed not a source + // of the p.692 non-determinism: the buffer is cleared+refilled to exactly + // `n` each call and never read out of range.) thread_local! { static SMOOTH_SCRATCH: std::cell::RefCell> = const { std::cell::RefCell::new(Vec::new()) }; diff --git a/src/structure/article_threads.rs b/src/structure/article_threads.rs new file mode 100644 index 000000000..5d1121150 --- /dev/null +++ b/src/structure/article_threads.rs @@ -0,0 +1,172 @@ +//! Article threads (`/Threads`) — ISO 32000-1:2008 §12.4.3. +//! +//! Article threads are an author-supplied explicit reading order that chains +//! logically-connected content ("beads") across columns and pages. They are +//! the canonical reading-order signal for untagged legacy magazine / multi- +//! column PDFs, predating the structure tree. +//! +//! Data model (Tables 160 / 161): +//! 
* Catalog `/Threads` → array of indirect refs to **thread dictionaries**. +//! * Thread dict: `/F` (required) → first **bead**; `/I` (optional) thread info. +//! * Bead dict: `/N` next bead, `/V` prev bead, `/P` page object, `/R` rect +//! `[llx lly urx ury]`. Beads form a **circular doubly-linked list** (the last +//! bead's `/N` points back to the first). +//! +//! This module only *parses* threads into page-local bead rectangles; the +//! reading-order integration lives in +//! [`crate::pipeline::reading_order::ArticleThreadStrategy`]. + +use std::collections::HashMap; + +use crate::document::PdfDocument; +use crate::geometry::Rect; +use crate::object::{Object, ObjectRef}; + +/// One bead: a rectangular region on a specific page, in PDF user space. +#[derive(Debug, Clone, PartialEq)] +pub struct Bead { + /// 0-based index of the page this bead sits on. + pub page_index: usize, + /// Bead rectangle (`/R`) in the page's default user space. + pub rect: Rect, +} + +/// One article thread: an ordered chain of beads (in `/N` order). +#[derive(Debug, Clone, PartialEq)] +pub struct ArticleThread { + /// Optional thread title (`/I /Title`). + pub title: Option, + /// Beads in reading (`/N`) order. + pub beads: Vec, +} + +/// Upper bound on the bead chain length — a defence against malformed, +/// non-circular `/N` chains produced by buggy generators. +const MAX_BEADS_PER_THREAD: usize = 4096; + +/// Resolve `obj` to a concrete object, following a single indirect reference. +fn resolve(doc: &PdfDocument, obj: &Object) -> Option { + match obj.as_reference() { + Some(r) => doc.load_object(r).ok(), + None => Some(obj.clone()), + } +} + +/// Parse a `/R` rectangle array `[llx lly urx ury]` into a [`Rect`] in user space. 
+fn parse_rect(arr: &[Object]) -> Option { + if arr.len() != 4 { + return None; + } + let n = |o: &Object| -> Option { + o.as_real() + .map(|v| v as f32) + .or_else(|| o.as_integer().map(|v| v as f32)) + }; + let (llx, lly, urx, ury) = (n(&arr[0])?, n(&arr[1])?, n(&arr[2])?, n(&arr[3])?); + Some(Rect::from_points(llx, lly, urx, ury)) +} + +/// Parse all article threads declared in the document catalog's `/Threads`. +/// +/// Best-effort and panic-free: malformed threads/beads are skipped, dangling +/// references are tolerated, and non-circular `/N` chains are bounded by +/// `MAX_BEADS_PER_THREAD`. Returns an empty vector when the document declares +/// no threads. +pub fn parse_article_threads(doc: &PdfDocument) -> Vec { + let Ok(catalog) = doc.catalog() else { + return Vec::new(); + }; + let Some(catalog_dict) = catalog.as_dict() else { + return Vec::new(); + }; + let Some(threads_obj) = catalog_dict.get("Threads") else { + return Vec::new(); + }; + let Some(threads_resolved) = resolve(doc, threads_obj) else { + return Vec::new(); + }; + let Some(threads_arr) = threads_resolved.as_array() else { + return Vec::new(); + }; + + // Map page ObjectRef -> 0-based page index for resolving each bead's /P. + let page_index: HashMap = doc + .all_page_refs() + .unwrap_or_default() + .into_iter() + .enumerate() + .map(|(i, r)| (r, i)) + .collect(); + + let mut threads = Vec::new(); + for thread_ref in threads_arr { + if let Some(thread) = parse_one_thread(doc, thread_ref, &page_index) { + if !thread.beads.is_empty() { + threads.push(thread); + } + } + } + threads +} + +fn parse_one_thread( + doc: &PdfDocument, + thread_obj: &Object, + page_index: &HashMap, +) -> Option { + let thread = resolve(doc, thread_obj)?; + let thread_dict = thread.as_dict()?; + + let title = thread_dict + .get("I") + .and_then(|i| resolve(doc, i)) + .and_then(|info| info.as_dict()?.get("Title").and_then(string_value)); + + // First bead is required (/F). Walk /N until we loop back to it. 
+ let first_ref = thread_dict.get("F")?.as_reference()?; + let mut beads = Vec::new(); + let mut seen = std::collections::HashSet::new(); + let mut cur = Some(first_ref); + + while let Some(bead_ref) = cur { + if !seen.insert(bead_ref) || beads.len() >= MAX_BEADS_PER_THREAD { + break; // circular wrap (normal terminator) or runaway chain + } + let Ok(bead_obj) = doc.load_object(bead_ref) else { + break; + }; + let Some(bead_dict) = bead_obj.as_dict() else { + break; + }; + + if let Some(bead) = parse_bead(bead_dict, page_index) { + beads.push(bead); + } + + // Advance to /N (next bead). Absent /N ends the chain. + cur = bead_dict.get("N").and_then(|n| n.as_reference()); + } + + Some(ArticleThread { title, beads }) +} + +fn parse_bead( + bead_dict: &HashMap, + page_index: &HashMap, +) -> Option { + let page_ref = bead_dict.get("P")?.as_reference()?; + let idx = *page_index.get(&page_ref)?; + let rect = parse_rect(bead_dict.get("R")?.as_array()?)?; + Some(Bead { + page_index: idx, + rect, + }) +} + +/// Decode a PDF text string object into a Rust `String` (best-effort). +fn string_value(obj: &Object) -> Option { + match obj { + Object::String(bytes) => Some(String::from_utf8_lossy(bytes).into_owned()), + _ => None, + } +} diff --git a/src/structure/mod.rs b/src/structure/mod.rs index a1b655082..d67ec5477 100644 --- a/src/structure/mod.rs +++ b/src/structure/mod.rs @@ -39,6 +39,7 @@ //! } //! 
``` +pub mod article_threads; pub mod converter; mod parser; pub mod spatial_table_detector; @@ -46,6 +47,7 @@ pub mod table_extractor; pub mod traversal; pub mod types; +pub use article_threads::{parse_article_threads, ArticleThread, Bead}; pub use converter::StructureConverter; pub use parser::parse_structure_tree; pub use spatial_table_detector::{ diff --git a/src/structure/spatial_table_detector.rs b/src/structure/spatial_table_detector.rs index 110b14bca..833b48f04 100644 --- a/src/structure/spatial_table_detector.rs +++ b/src/structure/spatial_table_detector.rs @@ -1503,7 +1503,12 @@ fn group_lines_into_clusters( // Post-processing: split clusters whose vertical lines occupy distinct Y-ranges. // This prevents a small bordered table (e.g. an invoice header) from merging // with a large main table that happens to be nearby vertically. - let raw_clusters: Vec = cluster_map.into_values().collect(); + // Deterministic order: `cluster_map` is a HashMap (per-process-randomized + // iteration), so sort clusters by their first (smallest) line index — each + // cluster's `lines` Vec is already ascending — to keep downstream table + // boundary order stable across runs. + let mut raw_clusters: Vec = cluster_map.into_values().collect(); + raw_clusters.sort_by_key(|c| c.lines.first().copied().unwrap_or(usize::MAX)); let mut result: Vec = Vec::with_capacity(raw_clusters.len()); const LINE_AXIS_TOL: f32 = 2.0; let v_split_gap = config.v_split_gap; @@ -2014,7 +2019,15 @@ fn reconstitute_dotted_lines(edges: &mut Vec) { } } - for segments in dotted_groups.values() { + // Iterate in sorted key order: `dotted_groups` is a HashMap (per-process- + // randomized), and the reconstituted edges are appended to `long_edges` + // (which becomes `*edges`), so HashMap order would leak into edge order and, + // downstream, table-cell/region order. Sorting the snapped-coordinate keys + // makes it deterministic. 
+ let mut dotted_keys: Vec = dotted_groups.keys().copied().collect(); + dotted_keys.sort_unstable(); + for key in dotted_keys { + let segments = &dotted_groups[&key]; if segments.len() >= DOTTED_MIN_SEGMENTS { let min_start = segments .iter() @@ -2215,11 +2228,27 @@ fn group_cells_into_tables(cells: &[IntersectionCell]) -> Vec> { let n = cells.len(); let mut uf = UnionFind::new(n); - // Two cells share an edge if they share two corners. - for i in 0..n { - for j in (i + 1)..n { - let ci = &cells[i]; + // Sweep-line prune for the O(n²) edge-adjacency scan (the hot loop on dense + // ruled pages — CFR regulatory megafiles, #26). BOTH adjacency tests below + // require the cells' y-extents to touch within SNAP_TOL: horizontal + // adjacency needs y1≈y1 (so cj.y1 ≤ ci.y2 + SNAP_TOL), and vertical + // adjacency needs cj.y1 ≈ ci.y2 (also ≤ ci.y2 + SNAP_TOL). Iterating cells + // in ascending-y1 order lets us `break` the inner loop once a candidate's + // y1 clears ci.y2 + SNAP_TOL — every later candidate has an even larger y1 + // and cannot share an edge. We `union` by ORIGINAL index, and union is + // order-independent, so the resulting partition is byte-identical to the + // full O(n²) scan; only provably-non-adjacent pairs are skipped. + let mut order: Vec = (0..n).collect(); + order.sort_by(|&a, &b| crate::utils::safe_float_cmp(cells[a].y1, cells[b].y1)); + for a in 0..n { + let i = order[a]; + let ci = &cells[i]; + let y_limit = ci.y2 + SNAP_TOL; + for &j in order.iter().skip(a + 1) { let cj = &cells[j]; + if cj.y1 > y_limit { + break; // sorted by y1 → no later cell can touch ci's y-extent + } let shares_edge = // Horizontal adjacency (share a vertical edge) (((ci.x2 - cj.x1).abs() <= SNAP_TOL || (ci.x1 - cj.x2).abs() <= SNAP_TOL) && (ci.y1 - cj.y1).abs() <= SNAP_TOL @@ -2234,8 +2263,16 @@ fn group_cells_into_tables(cells: &[IntersectionCell]) -> Vec> { } } - // Collect groups. - uf.groups().into_values().collect() + // Collect groups in a DETERMINISTIC order. 
`groups()` returns a HashMap + // whose iteration order is randomized per-process (Rust `RandomState`), so + // `into_values().collect()` would yield table clusters in a different order + // each run — leaking non-deterministic reading order on multi-table / figure + // pages (e.g. matrix-figure pages with several detected regions). Each + // group's `Vec` is already ascending (built `for i in 0..n`); sort the outer + // list by each group's first (smallest) cell index so table order is stable. + let mut groups: Vec> = uf.groups().into_values().collect(); + groups.sort_by_key(|g| g.first().copied().unwrap_or(usize::MAX)); + groups } /// Split table rows that contain text spans at multiple distinct Y positions into sub-rows. diff --git a/src/text/complex_script_detector.rs b/src/text/complex_script_detector.rs index cdf73b5b9..e865d512c 100644 --- a/src/text/complex_script_detector.rs +++ b/src/text/complex_script_detector.rs @@ -197,6 +197,14 @@ pub fn handle_devanagari_boundary( return Some(false); } + // Rule 6: No boundary AFTER a matra / dependent sign when the next glyph is + // another Devanagari character. matra→consonant is intra-word continuation; + // real word breaks carry an explicit space glyph. Mirrors `handle_indic_ + // boundary` Rule 3 (the dominant spurious-space direction). + if is_devanagari_diacritic(prev_code) && matches!(curr_code, 0x0900..=0x097F) { + return Some(false); + } + // Not a Devanagari-specific case - let other signals decide None } @@ -440,6 +448,16 @@ pub fn handle_indic_boundary(prev_char: &CharacterInfo, curr_char: &CharacterInf return Some(false); } + // Rule 3: No boundary AFTER a matra / dependent vowel sign / virama when the + // next glyph is another character of the same Brahmic script. A dependent + // vowel sign followed by a base consonant is always intra-word — real word + // breaks carry an explicit space glyph (U+0020), handled upstream. 
This is + // the dominant spurious-space direction (matra→consonant) that a purely + // geometric gap test inserts because the matra carries its own advance. + if is_indic_diacritic(prev_code) && detect_complex_script(curr_code).is_some() { + return Some(false); + } + // Not an Indic-specific case - let other signals decide None } diff --git a/src/text/word_boundary.rs b/src/text/word_boundary.rs index 5068d6e06..761dab3d4 100644 --- a/src/text/word_boundary.rs +++ b/src/text/word_boundary.rs @@ -172,8 +172,18 @@ impl DocumentScript { has_cjk = true; } - // Check for complex scripts + // Check for complex scripts. The Brahmic South-Asian blocks + // (Bengali, Tamil, Telugu, Kannada, Malayalam) were previously + // absent, so those docs classified as Latin/Mixed and never reached + // the complex-script boundary rules — leaking spurious spaces after + // matras (#656-class Indic gap). They share the same matra/virama + // boundary semantics as Devanagari. if (0x0900..=0x097F).contains(&ch.code) // Devanagari + || (0x0980..=0x09FF).contains(&ch.code) // Bengali + || (0x0B80..=0x0BFF).contains(&ch.code) // Tamil + || (0x0C00..=0x0C7F).contains(&ch.code) // Telugu + || (0x0C80..=0x0CFF).contains(&ch.code) // Kannada + || (0x0D00..=0x0D7F).contains(&ch.code) // Malayalam || (0x0E00..=0x0E7F).contains(&ch.code) // Thai || (0x1780..=0x17FF).contains(&ch.code) { diff --git a/tests/article_threads.rs b/tests/article_threads.rs new file mode 100644 index 000000000..67993fdac --- /dev/null +++ b/tests/article_threads.rs @@ -0,0 +1,111 @@ +//! #458: article-thread (`/Threads`) parsing — ISO 32000-1:2008 §12.4.3. +//! +//! Hand-builds a minimal PDF with one thread of three beads (a circular +//! doubly-linked list) spanning two pages, and asserts the parser walks the +//! chain in `/N` order, resolves each bead's `/P` page and `/R` rectangle, and +//! terminates on the circular wrap. No external fixtures. 
+ +use pdf_oxide::structure::parse_article_threads; +use pdf_oxide::PdfDocument; + +fn obj(buf: &mut Vec, offsets: &mut [usize], id: usize, body: &str) { + offsets[id] = buf.len(); + buf.extend_from_slice(format!("{id} 0 obj\n").as_bytes()); + buf.extend_from_slice(body.as_bytes()); + buf.extend_from_slice(b"\nendobj\n"); +} + +/// One thread, three beads (A→B→C→A), beads A/B on page 0 and C on page 1. +fn threaded_pdf() -> Vec { + let mut buf = Vec::new(); + let mut off = vec![0usize; 9]; // ids 1..=8 + buf.extend_from_slice(b"%PDF-1.7\n%\xE2\xE3\xCF\xD3\n"); + + obj(&mut buf, &mut off, 1, "<< /Type /Catalog /Pages 2 0 R /Threads [5 0 R] >>"); + obj(&mut buf, &mut off, 2, "<< /Type /Pages /Kids [3 0 R 4 0 R] /Count 2 >>"); + obj(&mut buf, &mut off, 3, "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >>"); + obj(&mut buf, &mut off, 4, "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >>"); + obj( + &mut buf, + &mut off, + 5, + "<< /Type /Thread /F 6 0 R /I << /Title (Cover Story) >> >>", + ); + // Bead A (page 0) + obj( + &mut buf, + &mut off, + 6, + "<< /Type /Bead /T 5 0 R /N 7 0 R /V 8 0 R /P 3 0 R /R [50 600 300 700] >>", + ); + // Bead B (page 0) + obj( + &mut buf, + &mut off, + 7, + "<< /Type /Bead /N 8 0 R /V 6 0 R /P 3 0 R /R [320 600 560 700] >>", + ); + // Bead C (page 1), /N wraps back to A + obj( + &mut buf, + &mut off, + 8, + "<< /Type /Bead /N 6 0 R /V 7 0 R /P 4 0 R /R [50 600 560 700] >>", + ); + + let xref_off = buf.len(); + buf.extend_from_slice(b"xref\n0 9\n0000000000 65535 f \n"); + for id in 1..=8 { + buf.extend_from_slice(format!("{:010} 00000 n \n", off[id]).as_bytes()); + } + buf.extend_from_slice(b"trailer\n<< /Size 9 /Root 1 0 R >>\nstartxref\n"); + buf.extend_from_slice(format!("{xref_off}\n%%EOF\n").as_bytes()); + buf +} + +#[test] +fn parses_thread_chain_in_order_and_terminates() { + let doc = PdfDocument::from_bytes(threaded_pdf()).unwrap(); + let threads = parse_article_threads(&doc); + + assert_eq!(threads.len(), 1, "exactly 
one thread"); + let t = &threads[0]; + assert_eq!(t.title.as_deref(), Some("Cover Story")); + assert_eq!(t.beads.len(), 3, "three beads; circular /N must terminate"); + + // Chain order A, B, C. + assert_eq!(t.beads[0].page_index, 0); + assert_eq!(t.beads[1].page_index, 0); + assert_eq!(t.beads[2].page_index, 1); + + // Bead A rect [50 600 300 700] -> x=50 y=600 w=250 h=100. + let a = &t.beads[0].rect; + assert!((a.x - 50.0).abs() < 1e-3 && (a.y - 600.0).abs() < 1e-3); + assert!((a.width - 250.0).abs() < 1e-3 && (a.height - 100.0).abs() < 1e-3); + + // Bead C spans the full text column on page 1. + let c = &t.beads[2].rect; + assert!((c.width - 510.0).abs() < 1e-3); +} + +#[test] +fn document_without_threads_yields_none() { + // The form fixture builder elsewhere proves threadless docs parse to empty; + // here a trivially-threadless catalog must give zero threads. + let mut buf = Vec::new(); + let mut off = vec![0usize; 4]; + buf.extend_from_slice(b"%PDF-1.7\n"); + obj(&mut buf, &mut off, 1, "<< /Type /Catalog /Pages 2 0 R >>"); + obj(&mut buf, &mut off, 2, "<< /Type /Pages /Kids [3 0 R] /Count 1 >>"); + obj(&mut buf, &mut off, 3, "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >>"); + let xref_off = buf.len(); + buf.extend_from_slice(b"xref\n0 4\n0000000000 65535 f \n"); + for id in 1..=3 { + buf.extend_from_slice(format!("{:010} 00000 n \n", off[id]).as_bytes()); + } + buf.extend_from_slice(b"trailer\n<< /Size 4 /Root 1 0 R >>\nstartxref\n"); + buf.extend_from_slice(format!("{xref_off}\n%%EOF\n").as_bytes()); + + let doc = PdfDocument::from_bytes(buf).unwrap(); + assert!(parse_article_threads(&doc).is_empty()); +} diff --git a/tests/core_parity.rs b/tests/core_parity.rs new file mode 100644 index 000000000..792d6ec6f --- /dev/null +++ b/tests/core_parity.rs @@ -0,0 +1,97 @@ +//! Core functional test-parity suite (Rust) — the reference implementation of +//! the shared cross-language spec +//! (docs/releases/plans/v0.3.61/core-test-parity-spec.md). 
Every binding mirrors +//! these behaviors with its own idiomatic API. (Search is a binding-level +//! convenience and has no single Rust-core method, so it is covered in the +//! bindings, not here.) + +use pdf_oxide::converters::ConversionOptions; +use pdf_oxide::writer::DocumentBuilder; +use pdf_oxide::PdfDocument; + +fn fixture_bytes() -> Vec { + std::fs::read("tests/fixtures/simple.pdf").expect("simple.pdf fixture") +} + +fn open() -> PdfDocument { + PdfDocument::from_bytes(fixture_bytes()).expect("open simple.pdf") +} + +fn build_bytes() -> Vec { + let mut b = DocumentBuilder::new(); + b.letter_page() + .font("Helvetica", 12.0) + .at(72.0, 720.0) + .heading(1, "Core Parity") + .at(72.0, 690.0) + .paragraph("Functional parity across all language bindings.") + .done(); + b.build().expect("build pdf") +} + +#[test] +fn open_and_page_count() { + assert_eq!(open().page_count().unwrap(), 1); +} + +#[test] +fn extract_text() { + let _: String = open().extract_text(0).unwrap(); +} + +#[test] +fn convert_markdown_html_plain() { + let doc = open(); + let o = ConversionOptions::default(); + let _ = doc.to_markdown(0, &o).unwrap(); + let _ = doc.to_html(0, &o).unwrap(); + let _ = doc.to_plain_text(0, &o).unwrap(); +} + +#[test] +fn structured() { + let _ = open().extract_structured(0).unwrap(); +} + +#[test] +fn create_pdf() { + assert!(build_bytes().starts_with(b"%PDF")); +} + +#[test] +fn from_bytes_page_count() { + assert_eq!( + PdfDocument::from_bytes(build_bytes()) + .unwrap() + .page_count() + .unwrap(), + 1 + ); +} + +#[test] +fn encrypt_roundtrip() { + let plain = build_bytes(); + let mut b = DocumentBuilder::new(); + b.letter_page() + .font("Helvetica", 12.0) + .at(72.0, 720.0) + .paragraph("secret") + .done(); + let enc = b.to_bytes_encrypted("user123", "owner123").unwrap(); + assert!(enc.starts_with(b"%PDF")); + assert_ne!(enc, plain, "encryption must change the bytes"); +} + +#[test] +fn open_error() { + assert!( + PdfDocument::from_bytes(b"this is not a 
pdf".to_vec()).is_err(), + "opening non-PDF bytes must error" + ); +} + +#[test] +fn version() { + assert_eq!(env!("CARGO_PKG_VERSION"), "0.3.61"); +} diff --git a/tests/form_fill_need_appearances.rs b/tests/form_fill_need_appearances.rs new file mode 100644 index 000000000..7d8aee91c --- /dev/null +++ b/tests/form_fill_need_appearances.rs @@ -0,0 +1,122 @@ +//! #647: filling a field on a PDF whose AcroForm is an *inline* dictionary in +//! the catalog (and whose widget carries an empty `/AP /N`) must set +//! `/NeedAppearances true` so viewers regenerate the appearance from `/V`+`/DA` +//! instead of honouring the empty appearance stream and rendering the field +//! blank (ISO 32000-1:2008 §12.7.3.3; Table 226 — a present `/AP` takes +//! precedence over `/DA`). +//! +//! The reporter's sample is an external PDF, so this test hand-builds a minimal +//! PDF with the same structure (inline AcroForm, merged field+widget, empty AP, +//! NO /NeedAppearances) — no external/MPL fixtures. + +use pdf_oxide::editor::form_fields::FormFieldValue; +use pdf_oxide::editor::DocumentEditor; +use pdf_oxide::extractors::forms::FormExtractor; +use pdf_oxide::PdfDocument; + +/// Append `id 0 obj\n\nendobj\n`, recording the object's byte offset. +fn obj(buf: &mut Vec, offsets: &mut [usize], id: usize, body: &str) { + offsets[id] = buf.len(); + buf.extend_from_slice(format!("{id} 0 obj\n").as_bytes()); + buf.extend_from_slice(body.as_bytes()); + buf.extend_from_slice(b"\nendobj\n"); +} + +/// A minimal one-field PDF whose AcroForm is INLINE in the catalog, with no +/// `/NeedAppearances`, and whose widget has an empty `/AP /N` form XObject. 
+fn inline_acroform_form() -> Vec { + let mut buf: Vec = Vec::new(); + let mut off = vec![0usize; 7]; // ids 1..=6 + + buf.extend_from_slice(b"%PDF-1.7\n%\xE2\xE3\xCF\xD3\n"); + + // 1: catalog with an INLINE AcroForm (no /NeedAppearances) + obj( + &mut buf, + &mut off, + 1, + "<< /Type /Catalog /Pages 2 0 R \ + /AcroForm << /Fields [4 0 R] /DA (/Helv 0 Tf 0 g) \ + /DR << /Font << /Helv 5 0 R >> >> >> >>", + ); + // 2: page tree + obj(&mut buf, &mut off, 2, "<< /Type /Pages /Kids [3 0 R] /Count 1 >>"); + // 3: page, references the widget in /Annots + obj( + &mut buf, + &mut off, + 3, + "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] \ + /Resources << >> /Annots [4 0 R] >>", + ); + // 4: merged text field + widget annotation, empty /AP /N -> obj 6 + obj( + &mut buf, + &mut off, + 4, + "<< /FT /Tx /T (full_name) /Type /Annot /Subtype /Widget \ + /Rect [72 700 400 720] /DA (/Helv 0 Tf 0 g) /AP << /N 6 0 R >> >>", + ); + // 5: Helvetica font for /DR + obj(&mut buf, &mut off, 5, "<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>"); + // 6: empty appearance stream (the blank field the user sees today) + let content = "/Tx BMC\nEMC\n"; + off[6] = buf.len(); + buf.extend_from_slice(b"6 0 obj\n"); + buf.extend_from_slice( + format!( + "<< /Type /XObject /Subtype /Form /BBox [0 0 328 20] /Length {} >>\nstream\n", + content.len() + ) + .as_bytes(), + ); + buf.extend_from_slice(content.as_bytes()); + buf.extend_from_slice(b"endstream\nendobj\n"); + + // xref + let xref_off = buf.len(); + buf.extend_from_slice(b"xref\n0 7\n0000000000 65535 f \n"); + for id in 1..=6 { + buf.extend_from_slice(format!("{:010} 00000 n \n", off[id]).as_bytes()); + } + buf.extend_from_slice(b"trailer\n<< /Size 7 /Root 1 0 R >>\nstartxref\n"); + buf.extend_from_slice(format!("{xref_off}\n%%EOF\n").as_bytes()); + buf +} + +fn contains_ascii(hay: &[u8], needle: &[u8]) -> bool { + hay.windows(needle.len()).any(|w| w == needle) +} + +#[test] +fn inline_acroform_fill_sets_need_appearances() { + 
let form = inline_acroform_form(); + + // Sanity: the synthetic form parses and exposes exactly one field. + let doc0 = PdfDocument::from_bytes(form.clone()).unwrap(); + assert_eq!( + FormExtractor::extract_fields(&doc0).unwrap().len(), + 1, + "synthetic inline-AcroForm form should have 1 field" + ); + + let mut ed = DocumentEditor::from_bytes(form).unwrap(); + ed.set_form_field_value("full_name", FormFieldValue::Text("Hello".into())) + .unwrap(); + // Default save path (full rewrite) — the path the Python `save()` hits. + let out = ed.save_to_bytes().unwrap(); + + // #647: the saved PDF must carry /NeedAppearances true so the empty /AP /N + // is regenerated by the viewer rather than rendered blank. + assert!( + contains_ascii(&out, b"/NeedAppearances") && contains_ascii(&out, b"NeedAppearances true"), + "filled inline-AcroForm PDF must set /NeedAppearances true (#647)" + ); + + // The value must still round-trip. + let doc = PdfDocument::from_bytes(out).unwrap(); + let fields = FormExtractor::extract_fields(&doc).unwrap(); + assert_eq!(fields.len(), 1, "field must survive fill+save"); + let v = format!("{:?}", fields[0].value); + assert!(v.contains("Hello"), "re-read value must equal Hello; got {v}"); +} diff --git a/wasm-pkg/package.json b/wasm-pkg/package.json index a54afee88..994d13afd 100644 --- a/wasm-pkg/package.json +++ b/wasm-pkg/package.json @@ -1,6 +1,6 @@ { "name": "pdf-oxide-wasm", - "version": "0.3.60", + "version": "0.3.61", "description": "Fast, zero-dependency PDF toolkit for Node.js, browsers, and edge runtimes — text extraction, markdown/HTML conversion, search, form filling, creation, and editing. Rust core compiled to WebAssembly.", "license": "MIT OR Apache-2.0", "repository": {