diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml index cae620baf46c..0b5c78405b10 100644 --- a/.github/workflows/audit.yml +++ b/.github/workflows/audit.yml @@ -42,7 +42,7 @@ jobs: steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Install cargo-audit - uses: taiki-e/install-action@ebb229c6baa68383264f2822689b07b4916d9177 # v2.62.36 + uses: taiki-e/install-action@44c6d64aa62cd779e873306675c7a58e86d6d532 # v2.62.49 with: tool: cargo-audit - name: Run audit check diff --git a/.github/workflows/extended.yml b/.github/workflows/extended.yml index 23bd66a0cf35..85e40731a959 100644 --- a/.github/workflows/extended.yml +++ b/.github/workflows/extended.yml @@ -44,6 +44,7 @@ on: - 'datafusion/physical*/**/*.rs' - 'datafusion/expr*/**/*.rs' - 'datafusion/optimizer/**/*.rs' + - 'datafusion-testing' workflow_dispatch: inputs: pr_number: @@ -123,7 +124,7 @@ jobs: --lib \ --tests \ --bins \ - --features avro,json,backtrace,extended_tests,recursive_protection + --features avro,json,backtrace,extended_tests,recursive_protection,parquet_encryption - name: Verify Working Directory Clean run: git diff --exit-code - name: Cleanup @@ -168,7 +169,7 @@ jobs: rust-version: stable - name: Run sqllogictest run: | - cargo test --features backtrace --profile release-nonlto --test sqllogictests -- --include-sqlite + cargo test --features backtrace,parquet_encryption --profile release-nonlto --test sqllogictests -- --include-sqlite cargo clean diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index e9606e15c4ec..9bdcade8eb2e 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -302,6 +302,15 @@ jobs: --features serde,avro,json,backtrace,integration-tests,parquet_encryption - name: Verify Working Directory Clean run: git diff --exit-code + # Check that no temporary directories are created during tests. + # The `false/` folder is excluded for the rust cache. + - name: Verify Working Directory Clean (No Untracked Files) + run: | + STATUS="$(git status --porcelain | sed -e '/^?? false\/$/d' -e '/^?? 
false$/d')" + if [ -n "$STATUS" ]; then + echo "$STATUS" + exit 1 + fi # datafusion-cli tests linux-test-datafusion-cli: @@ -425,7 +434,7 @@ jobs: sudo apt-get update -qq sudo apt-get install -y -qq clang - name: Setup wasm-pack - uses: taiki-e/install-action@ebb229c6baa68383264f2822689b07b4916d9177 # v2.62.36 + uses: taiki-e/install-action@44c6d64aa62cd779e873306675c7a58e86d6d532 # v2.62.49 with: tool: wasm-pack - name: Run tests with headless mode @@ -466,7 +475,7 @@ jobs: export RUST_MIN_STACK=20971520 export TPCH_DATA=`realpath datafusion/sqllogictest/test_files/tpch/data` cargo test plan_q --package datafusion-benchmarks --profile ci --features=ci -- --test-threads=1 - INCLUDE_TPCH=true cargo test --features backtrace --profile ci --package datafusion-sqllogictest --test sqllogictests + INCLUDE_TPCH=true cargo test --features backtrace,parquet_encryption --profile ci --package datafusion-sqllogictest --test sqllogictests - name: Verify Working Directory Clean run: git diff --exit-code @@ -752,7 +761,7 @@ jobs: - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Install cargo-msrv - uses: taiki-e/install-action@ebb229c6baa68383264f2822689b07b4916d9177 # v2.62.36 + uses: taiki-e/install-action@44c6d64aa62cd779e873306675c7a58e86d6d532 # v2.62.49 with: tool: cargo-msrv @@ -797,4 +806,4 @@ jobs: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: persist-credentials: false - - uses: crate-ci/typos@80c8a4945eec0f6d464eaf9e65ed98ef085283d1 # v1.38.1 + - uses: crate-ci/typos@07d900b8fa1097806b8adb6391b0d3e0ac2fdea7 # v1.39.0 diff --git a/Cargo.lock b/Cargo.lock index e368dcf9a91e..9cf4d96415c0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -83,9 +83,9 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "1.1.3" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" dependencies = [ "memchr", ] @@ -128,9 +128,9 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstream" -version = "0.6.20" +version = "0.6.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ae563653d1938f79b1ab1b5e668c87c76a9930414574a6583a7b7e11a8e6192" +checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" dependencies = [ "anstyle", "anstyle-parse", @@ -143,9 +143,9 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.11" +version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd" +checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" [[package]] name = "anstyle-parse" @@ -203,14 +203,23 @@ dependencies = [ "serde_bytes", "serde_json", "snap", - "strum 0.27.2", - "strum_macros 0.27.2", + "strum", + "strum_macros", "thiserror", "uuid", "xz2", "zstd", ] +[[package]] +name = "ar_archive_writer" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0c269894b6fe5e9d7ada0cf69b5bf847ff35bc25fc271f08e1d080fce80339a" +dependencies = [ + "object", +] + [[package]] name = "arrayref" version = "0.3.9" @@ -225,9 +234,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "56.2.0" +version = "57.0.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e833808ff2d94ed40d9379848a950d995043c7fb3e81a30b383f4c6033821cc" +checksum = "4df8bb5b0bd64c0b9bc61317fcc480bad0f00e56d3bc32c69a4c8dada4786bae" dependencies = [ "arrow-arith", "arrow-array", @@ -249,23 +258,23 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad08897b81588f60ba983e3ca39bda2b179bdd84dced378e7df81a5313802ef8" +checksum = "a1a640186d3bd30a24cb42264c2dafb30e236a6f50d510e56d40b708c9582491" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "chrono", - "num", + "num-traits", ] [[package]] name = "arrow-array" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8548ca7c070d8db9ce7aa43f37393e4bfcf3f2d3681df278490772fd1673d08d" +checksum = "219fe420e6800979744c8393b687afb0252b3f8a89b91027d27887b72aa36d31" dependencies = [ "ahash 0.8.12", "arrow-buffer", @@ -275,25 +284,28 @@ dependencies = [ "chrono-tz", "half", "hashbrown 0.16.0", - "num", + "num-complex", + "num-integer", + "num-traits", ] [[package]] name = "arrow-buffer" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e003216336f70446457e280807a73899dd822feaf02087d31febca1363e2fccc" +checksum = "76885a2697a7edf6b59577f568b456afc94ce0e2edc15b784ce3685b6c3c5c27" dependencies = [ "bytes", "half", - "num", + "num-bigint", + "num-traits", ] [[package]] name = "arrow-cast" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "919418a0681298d3a77d1a315f625916cb5678ad0d74b9c60108eb15fd083023" +checksum = "9c9ebb4c987e6b3b236fb4a14b20b34835abfdd80acead3ccf1f9bf399e1f168" dependencies = [ "arrow-array", "arrow-buffer", @@ -306,15 +318,15 @@ dependencies = [ "comfy-table", "half", "lexical-core", - "num", + "num-traits", "ryu", ] [[package]] name = "arrow-csv" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa9bf02705b5cf762b6f764c65f04ae9082c7cfc4e96e0c33548ee3f67012eb" +checksum = "92386159c8d4bce96f8bd396b0642a0d544d471bdc2ef34d631aec80db40a09c" dependencies = [ "arrow-array", "arrow-cast", @@ -327,21 +339,22 @@ dependencies = [ [[package]] name = "arrow-data" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5c64fff1d142f833d78897a772f2e5b55b36cb3e6320376f0961ab0db7bd6d0" +checksum = "727681b95de313b600eddc2a37e736dcb21980a40f640314dcf360e2f36bc89b" dependencies = [ "arrow-buffer", "arrow-schema", "half", - "num", + "num-integer", + "num-traits", ] [[package]] name = "arrow-flight" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c8b0ba0784d56bc6266b79f5de7a24b47024e7b3a0045d2ad4df3d9b686099f" +checksum = "f70bb56412a007b0cfc116d15f24dda6adeed9611a213852a004cda20085a3b9" dependencies = [ "arrow-arith", "arrow-array", @@ -359,16 +372,17 @@ dependencies = [ "futures", "once_cell", "paste", - "prost 0.13.5", - "prost-types 0.13.5", + "prost", + "prost-types", "tonic", + "tonic-prost", ] [[package]] name = "arrow-ipc" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d3594dcddccc7f20fd069bc8e9828ce37220372680ff638c5e00dea427d88f5" +checksum = 
"da9ba92e3de170295c98a84e5af22e2b037f0c7b32449445e6c493b5fca27f27" dependencies = [ "arrow-array", "arrow-buffer", @@ -382,9 +396,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88cf36502b64a127dc659e3b305f1d993a544eab0d48cce704424e62074dc04b" +checksum = "b969b4a421ae83828591c6bf5450bd52e6d489584142845ad6a861f42fe35df8" dependencies = [ "arrow-array", "arrow-buffer", @@ -394,19 +408,21 @@ dependencies = [ "chrono", "half", "indexmap 2.12.0", + "itoa", "lexical-core", "memchr", - "num", - "serde", + "num-traits", + "ryu", + "serde_core", "serde_json", "simdutf8", ] [[package]] name = "arrow-ord" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c8f82583eb4f8d84d4ee55fd1cb306720cddead7596edce95b50ee418edf66f" +checksum = "141c05298b21d03e88062317a1f1a73f5ba7b6eb041b350015b1cd6aabc0519b" dependencies = [ "arrow-array", "arrow-buffer", @@ -417,9 +433,9 @@ dependencies = [ [[package]] name = "arrow-pyarrow" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d924b32e96f8bb74d94cd82bd97b313c432fcb0ea331689ef9e7c6b8be4b258" +checksum = "cfcfb2be2e9096236f449c11f425cddde18c4cc540f516d90f066f10a29ed515" dependencies = [ "arrow-array", "arrow-data", @@ -429,9 +445,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d07ba24522229d9085031df6b94605e0f4b26e099fb7cdeec37abd941a73753" +checksum = "c5f3c06a6abad6164508ed283c7a02151515cef3de4b4ff2cebbcaeb85533db2" dependencies = [ "arrow-array", "arrow-buffer", @@ -442,34 +458,35 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3aa9e59c611ebc291c28582077ef25c97f1975383f1479b12f3b9ffee2ffabe" +checksum = "9cfa7a03d1eee2a4d061476e1840ad5c9867a544ca6c4c59256496af5d0a8be5" dependencies = [ - "bitflags 2.9.4", + "bitflags 2.10.0", "serde", + "serde_core", "serde_json", ] [[package]] name = "arrow-select" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c41dbbd1e97bfcaee4fcb30e29105fb2c75e4d82ae4de70b792a5d3f66b2e7a" +checksum = "bafa595babaad59f2455f4957d0f26448fb472722c186739f4fac0823a1bdb47" dependencies = [ "ahash 0.8.12", "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", - "num", + "num-traits", ] [[package]] name = "arrow-string" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53f5183c150fbc619eede22b861ea7c0eebed8eaac0333eaa7f6da5205fd504d" +checksum = "32f46457dbbb99f2650ff3ac23e46a929e0ab81db809b02aa5511c258348bef2" dependencies = [ "arrow-array", "arrow-buffer", @@ -477,7 +494,7 @@ dependencies = [ "arrow-schema", "arrow-select", "memchr", - "num", + "num-traits", "regex", "regex-syntax", ] @@ -494,6 +511,22 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "astral-tokio-tar" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec179a06c1769b1e42e1e2cbe74c7dcdb3d6383c838454d063eaac5bbb7ebbe5" +dependencies = [ + "filetime", + "futures-core", + "libc", + "portable-atomic", + "rustc-hash", + "tokio", + "tokio-stream", + "xattr", +] + [[package]] name = 
"async-compression" version = "0.4.19" @@ -528,7 +561,29 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", +] + +[[package]] +name = "async-stream" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476" +dependencies = [ + "async-stream-impl", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.110", ] [[package]] @@ -539,7 +594,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -565,9 +620,9 @@ checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" [[package]] name = "aws-config" -version = "1.8.7" +version = "1.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04b37ddf8d2e9744a0b9c19ce0b78efe4795339a90b66b7bae77987092cd2e69" +checksum = "1856b1b48b65f71a4dd940b1c0931f9a7b646d4a924b9828ffefc1454714668a" dependencies = [ "aws-credential-types", "aws-runtime", @@ -595,9 +650,9 @@ dependencies = [ [[package]] name = "aws-credential-types" -version = "1.2.7" +version = "1.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "799a1290207254984cb7c05245111bc77958b92a3c9bb449598044b36341cce6" +checksum = "86590e57ea40121d47d3f2e131bfd873dea15d78dc2f4604f4734537ad9e56c4" dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", @@ -607,9 +662,9 @@ dependencies = [ [[package]] name = "aws-lc-rs" -version = "1.14.0" +version = "1.14.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94b8ff6c09cd57b16da53641caa860168b88c172a5ee163b0288d3d6eea12786" +checksum = "879b6c89592deb404ba4dc0ae6b58ffd1795c78991cbb5b8bc441c48a070440d" dependencies = [ "aws-lc-sys", "zeroize", @@ -617,9 +672,9 @@ dependencies = [ [[package]] name = "aws-lc-sys" -version = "0.31.0" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e44d16778acaf6a9ec9899b92cebd65580b83f685446bf2e1f5d3d732f99dcd" +checksum = "107a4e9d9cab9963e04e84bb8dee0e25f2a987f9a8bad5ed054abd439caa8f8c" dependencies = [ "bindgen", "cc", @@ -630,9 +685,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.5.11" +version = "1.5.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e1ed337dabcf765ad5f2fb426f13af22d576328aaf09eac8f70953530798ec0" +checksum = "8fe0fd441565b0b318c76e7206c8d1d0b0166b3e986cf30e890b61feb6192045" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -654,9 +709,9 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.85.0" +version = "1.89.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f2c741e2e439f07b5d1b33155e246742353d82167c785a2ff547275b7e32483" +checksum = "a9c1b1af02288f729e95b72bd17988c009aa72e26dcb59b3200f86d7aea726c9" dependencies = [ "aws-credential-types", "aws-runtime", @@ -676,9 +731,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.87.0" +version = "1.91.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6428ae5686b18c0ee99f6f3c39d94ae3f8b42894cdc35c35d8fb2470e9db2d4c" +checksum = "4e8122301558dc7c6c68e878af918880b82ff41897a60c8c4e18e4dc4d93e9f1" dependencies = [ "aws-credential-types", "aws-runtime", @@ -698,9 +753,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.87.0" +version = "1.91.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5871bec9a79a3e8d928c7788d654f135dde0e71d2dd98089388bab36b37ef607" +checksum = "8f8090151d4d1e971269957b10dbf287bba551ab812e591ce0516b1c73b75d27" dependencies = [ "aws-credential-types", "aws-runtime", @@ -721,9 +776,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.3.4" +version = "1.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "084c34162187d39e3740cb635acd73c4e3a551a36146ad6fe8883c929c9f876c" +checksum = "c35452ec3f001e1f2f6db107b6373f1f48f05ec63ba2c5c9fa91f07dad32af11" dependencies = [ "aws-credential-types", "aws-smithy-http", @@ -743,9 +798,9 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "1.2.5" +version = "1.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e190749ea56f8c42bf15dd76c65e14f8f765233e6df9b0506d9d934ebef867c" +checksum = "127fcfad33b7dfc531141fda7e1c402ac65f88aca5511a4d31e2e3d2cd01ce9c" dependencies = [ "futures-util", "pin-project-lite", @@ -754,15 +809,16 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.62.3" +version = "0.62.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c4dacf2d38996cf729f55e7a762b30918229917eca115de45dfa8dfb97796c9" +checksum = "445d5d720c99eed0b4aa674ed00d835d9b1427dd73e04adaf2f94c6b2d6f9fca" dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", "bytes", "bytes-utils", "futures-core", + "futures-util", "http 0.2.12", "http 1.3.1", "http-body 0.4.6", @@ -774,9 +830,9 @@ dependencies = [ [[package]] name = "aws-smithy-http-client" -version = "1.1.1" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "147e8eea63a40315d704b97bf9bc9b8c1402ae94f89d5ad6f7550d963309da1b" +checksum = "623254723e8dfd535f566ee7b2381645f8981da086b5c4aa26c0c41582bb1d2c" dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", @@ -798,27 +854,27 @@ dependencies = [ [[package]] name = "aws-smithy-json" -version = "0.61.5" +version = "0.61.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eaa31b350998e703e9826b2104dd6f63be0508666e1aba88137af060e8944047" +checksum = "2db31f727935fc63c6eeae8b37b438847639ec330a9161ece694efba257e0c54" dependencies = [ "aws-smithy-types", ] [[package]] name = "aws-smithy-observability" -version = "0.1.3" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9364d5989ac4dd918e5cc4c4bdcc61c9be17dcd2586ea7f69e348fc7c6cab393" +checksum = "2d1881b1ea6d313f9890710d65c158bdab6fb08c91ea825f74c1c8c357baf4cc" dependencies = [ "aws-smithy-runtime-api", ] [[package]] name = "aws-smithy-query" -version = "0.60.7" +version = "0.60.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2fbd61ceb3fe8a1cb7352e42689cec5335833cd9f94103a61e98f9bb61c64bb" +checksum = "d28a63441360c477465f80c7abac3b9c4d075ca638f982e605b7dc2a2c7156c9" dependencies = [ "aws-smithy-types", "urlencoding", @@ -826,9 +882,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.9.2" +version = "1.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum 
= "4fa63ad37685ceb7762fa4d73d06f1d5493feb88e3f27259b9ed277f4c01b185" +checksum = "0bbe9d018d646b96c7be063dd07987849862b0e6d07c778aad7d93d1be6c1ef0" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -850,9 +906,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime-api" -version = "1.9.0" +version = "1.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07f5e0fc8a6b3f2303f331b94504bbf754d85488f402d6f1dd7a6080f99afe56" +checksum = "ec7204f9fd94749a7c53b26da1b961b4ac36bf070ef1e0b94bb09f79d4f6c193" dependencies = [ "aws-smithy-async", "aws-smithy-types", @@ -867,9 +923,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.3.2" +version = "1.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d498595448e43de7f4296b7b7a18a8a02c61ec9349128c80a368f7c3b4ab11a8" +checksum = "25f535879a207fce0db74b679cfc3e91a3159c8144d717d55f5832aea9eef46e" dependencies = [ "base64-simd", "bytes", @@ -890,18 +946,18 @@ dependencies = [ [[package]] name = "aws-smithy-xml" -version = "0.60.10" +version = "0.60.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3db87b96cb1b16c024980f133968d52882ca0daaee3a086c6decc500f6c99728" +checksum = "eab77cdd036b11056d2a30a7af7b775789fb024bf216acc13884c6c97752ae56" dependencies = [ "xmlparser", ] [[package]] name = "aws-types" -version = "1.3.8" +version = "1.3.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b069d19bf01e46298eaedd7c6f283fe565a59263e53eebec945f3e6398f42390" +checksum = "d79fb68e3d7fe5d4833ea34dc87d2e97d26d3086cb3da660bb6b1f76d98680b6" dependencies = [ "aws-credential-types", "aws-smithy-async", @@ -913,9 +969,9 @@ dependencies = [ [[package]] name = "axum" -version = "0.8.4" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "021e862c184ae977658b36c4500f7feac3221ca5da43e3f25bd04ab6c79a29b5" +checksum = "8a18ed336352031311f4e0b4dd2ff392d4fbb370777c9d18d7fc9d7359f73871" dependencies = [ "axum-core", "bytes", @@ -929,8 +985,7 @@ dependencies = [ "mime", "percent-encoding", "pin-project-lite", - "rustversion", - "serde", + "serde_core", "sync_wrapper", "tower", "tower-layer", @@ -939,9 +994,9 @@ dependencies = [ [[package]] name = "axum-core" -version = "0.5.2" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68464cd0412f486726fb3373129ef5d2993f90c34bc2bc1c1e9943b2f4fc7ca6" +checksum = "59446ce19cd142f8833f856eb31f3eb097812d1479ab224f54d72428ca21ea22" dependencies = [ "bytes", "futures-core", @@ -950,7 +1005,6 @@ dependencies = [ "http-body-util", "mime", "pin-project-lite", - "rustversion", "sync_wrapper", "tower-layer", "tower-service", @@ -980,9 +1034,9 @@ dependencies = [ [[package]] name = "bigdecimal" -version = "0.4.8" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a22f228ab7a1b23027ccc6c350b72868017af7ea8356fbdf19f8d991c690013" +checksum = "560f42649de9fa436b73517378a147ec21f6c997a546581df4b4b31677828934" dependencies = [ "autocfg", "libm", @@ -998,7 +1052,7 @@ version = "0.72.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" dependencies = [ - "bitflags 2.9.4", + "bitflags 2.10.0", "cexpr", "clang-sys", "itertools 0.13.0", @@ -1009,7 +1063,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -1020,9 +1074,9 @@ 
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.9.4" +version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394" +checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" [[package]] name = "bitvec" @@ -1069,13 +1123,17 @@ dependencies = [ [[package]] name = "bollard" -version = "0.18.1" +version = "0.19.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97ccca1260af6a459d75994ad5acc1651bcabcbdbc41467cc9786519ab854c30" +checksum = "87a52479c9237eb04047ddb94788c41ca0d26eaff8b697ecfbb4c32f7fdc3b1b" dependencies = [ + "async-stream", "base64 0.22.1", + "bitflags 2.10.0", + "bollard-buildkit-proto", "bollard-stubs", "bytes", + "chrono", "futures-core", "futures-util", "hex", @@ -1088,7 +1146,9 @@ dependencies = [ "hyper-util", "hyperlocal", "log", + "num", "pin-project-lite", + "rand 0.9.2", "rustls", "rustls-native-certs", "rustls-pemfile", @@ -1100,28 +1160,49 @@ dependencies = [ "serde_urlencoded", "thiserror", "tokio", + "tokio-stream", "tokio-util", + "tonic", "tower-service", "url", "winapi", ] +[[package]] +name = "bollard-buildkit-proto" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85a885520bf6249ab931a764ffdb87b0ceef48e6e7d807cfdb21b751e086e1ad" +dependencies = [ + "prost", + "prost-types", + "tonic", + "tonic-prost", + "ureq", +] + [[package]] name = "bollard-stubs" -version = "1.47.1-rc.27.3.1" +version = "1.49.1-rc.28.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f179cfbddb6e77a5472703d4b30436bff32929c0aa8a9008ecf23d1d3cdd0da" +checksum = "5731fe885755e92beff1950774068e0cae67ea6ec7587381536fca84f1779623" dependencies = [ + "base64 0.22.1", + "bollard-buildkit-proto", + "bytes", + "chrono", + "prost", "serde", + "serde_json", "serde_repr", "serde_with", ] [[package]] name = "bon" -version = "3.7.2" +version = "3.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2529c31017402be841eb45892278a6c21a000c0a17643af326c73a73f83f0fb" +checksum = "ebeb9aaf9329dff6ceb65c689ca3db33dbf15f324909c60e4e5eef5701ce31b1" dependencies = [ "bon-macros", "rustversion", @@ -1129,9 +1210,9 @@ dependencies = [ [[package]] name = "bon-macros" -version = "3.7.2" +version = "3.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d82020dadcb845a345591863adb65d74fa8dc5c18a0b6d408470e13b7adc7005" +checksum = "77e9d642a7e3a318e37c2c9427b5a6a48aa1ad55dcd986f3034ab2239045a645" dependencies = [ "darling", "ident_case", @@ -1139,7 +1220,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -1162,7 +1243,7 @@ dependencies = [ "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -1188,9 +1269,9 @@ dependencies = [ [[package]] name = "bstr" -version = "1.12.0" +version = "1.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "234113d19d0d7d613b40e86fb654acf958910802bcceab913a4f9e7cda03b1a4" +checksum = "63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab" dependencies = [ "memchr", "serde", @@ -1282,9 +1363,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.2.38" +version = "1.2.45" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "80f41ae168f955c12fb8960b057d70d0ca153fb83182b57d86380443527be7e9" +checksum = "35900b6c8d709fb1d854671ae27aeaa9eec2f8b01b364e1619a40da3e6fe2afe" dependencies = [ "find-msvc-tools", "jobserver", @@ -1303,9 +1384,9 @@ dependencies = [ [[package]] name = "cfg-if" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" [[package]] name = "cfg_aliases" @@ -1324,7 +1405,7 @@ dependencies = [ "num-traits", "serde", "wasm-bindgen", - "windows-link 0.2.0", + "windows-link 0.2.1", ] [[package]] @@ -1388,9 +1469,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.48" +version = "4.5.51" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2134bb3ea021b78629caa971416385309e0131b351b25e01dc16fb54e1b5fae" +checksum = "4c26d721170e0295f191a69bd9a1f93efcdb0aff38684b61ab5750468972e5f5" dependencies = [ "clap_builder", "clap_derive", @@ -1398,9 +1479,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.48" +version = "4.5.51" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2ba64afa3c0a6df7fa517765e31314e983f51dda798ffba27b988194fb65dc9" +checksum = "75835f0c7bf681bfd05abe44e965760fea999a5286c6eb2d59883634fd02011a" dependencies = [ "anstream", "anstyle", @@ -1410,21 +1491,21 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.47" +version = "4.5.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbfd7eae0b0f1a6e63d4b13c9c478de77c2eb546fba158ad50b4203dc24b9f9c" +checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671" dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] name = "clap_lex" -version = "0.7.5" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" +checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" [[package]] name = "clipboard-win" @@ -1452,13 +1533,12 @@ checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" [[package]] name = "comfy-table" -version = "7.1.2" +version = "7.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0d05af1e006a2407bedef5af410552494ce5be9090444dbbcb57258c1af3d56" +checksum = "b03b7db8e0b4b2fdad6c551e634134e99ec000e5c8c3b6856c65e8bbaded7a3b" dependencies = [ - "strum 0.26.3", - "strum_macros 0.26.4", - "unicode-width 0.2.1", + "unicode-segmentation", + "unicode-width 0.2.2", ] [[package]] @@ -1482,8 +1562,8 @@ dependencies = [ "encode_unicode", "libc", "once_cell", - "unicode-width 0.2.1", - "windows-sys 0.61.0", + "unicode-width 0.2.2", + "windows-sys 0.61.2", ] [[package]] @@ -1582,26 +1662,23 @@ dependencies = [ [[package]] name = "criterion" -version = "0.5.1" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" +checksum = "e1c047a62b0cc3e145fa84415a3191f628e980b194c2755aa12300a4e6cbd928" dependencies = [ "anes", "cast", "ciborium", - "clap 4.5.48", + "clap 4.5.51", "criterion-plot", "futures", - "is-terminal", - "itertools 0.10.5", + "itertools 0.13.0", "num-traits", - "once_cell", "oorandom", 
"plotters", "rayon", "regex", "serde", - "serde_derive", "serde_json", "tinytemplate", "tokio", @@ -1610,12 +1687,12 @@ dependencies = [ [[package]] name = "criterion-plot" -version = "0.5.0" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +checksum = "9b1bcc0dc7dfae599d84ad0b1a55f80cde8af3725da8313b528da95ef783e338" dependencies = [ "cast", - "itertools 0.10.5", + "itertools 0.13.0", ] [[package]] @@ -1670,30 +1747,30 @@ dependencies = [ [[package]] name = "csv" -version = "1.3.1" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf" +checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938" dependencies = [ "csv-core", "itoa", "ryu", - "serde", + "serde_core", ] [[package]] name = "csv-core" -version = "0.1.12" +version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d02f3b0da4c6504f86e9cd789d8dbafab48c2321be74e9987593de5a894d93d" +checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782" dependencies = [ "memchr", ] [[package]] name = "ctor" -version = "0.4.3" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec09e802f5081de6157da9a75701d6c713d8dc3ba52571fd4bd25f412644e8a6" +checksum = "3ffc71fcdcdb40d6f087edddf7f8f1f8f79e6cf922f555a9ee8779752d4819bd" dependencies = [ "ctor-proc-macro", "dtor", @@ -1701,9 +1778,9 @@ dependencies = [ [[package]] name = "ctor-proc-macro" -version = "0.0.6" +version = "0.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2931af7e13dc045d8e9d26afccc6fa115d64e115c9c84b1166288b46f6782c2" +checksum = "52560adf09603e58c9a7ee1fe1dcb95a16927b17c127f0ac02d6e768a0e25bc1" [[package]] name = "cty" @@ -1732,7 +1809,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -1743,7 +1820,7 @@ checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" dependencies = [ "darling_core", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -1762,7 +1839,7 @@ dependencies = [ [[package]] name = "datafusion" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "arrow-schema", @@ -1834,7 +1911,7 @@ dependencies = [ [[package]] name = "datafusion-benchmarks" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "datafusion", @@ -1859,7 +1936,7 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "async-trait", @@ -1882,7 +1959,7 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "async-trait", @@ -1900,19 +1977,18 @@ dependencies = [ "itertools 0.14.0", "log", "object_store", - "tokio", ] [[package]] name = "datafusion-cli" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "async-trait", "aws-config", "aws-credential-types", "chrono", - "clap 4.5.48", + "clap 4.5.51", "ctor", "datafusion", "datafusion-common", @@ -1937,7 +2013,7 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "50.3.0" +version = "51.0.0" dependencies = [ "ahash 0.8.12", "apache-avro", @@ -1964,7 +2040,7 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "50.3.0" +version = "51.0.0" dependencies = [ "futures", "log", @@ 
-1973,7 +2049,7 @@ dependencies = [ [[package]] name = "datafusion-datasource" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "async-compression", @@ -2008,7 +2084,7 @@ dependencies = [ [[package]] name = "datafusion-datasource-arrow" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "arrow-ipc", @@ -2031,7 +2107,7 @@ dependencies = [ [[package]] name = "datafusion-datasource-avro" -version = "50.3.0" +version = "51.0.0" dependencies = [ "apache-avro", "arrow", @@ -2050,7 +2126,7 @@ dependencies = [ [[package]] name = "datafusion-datasource-csv" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "async-trait", @@ -2071,7 +2147,7 @@ dependencies = [ [[package]] name = "datafusion-datasource-json" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "async-trait", @@ -2091,7 +2167,7 @@ dependencies = [ [[package]] name = "datafusion-datasource-parquet" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "async-trait", @@ -2120,11 +2196,11 @@ dependencies = [ [[package]] name = "datafusion-doc" -version = "50.3.0" +version = "51.0.0" [[package]] name = "datafusion-examples" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "arrow-flight", @@ -2143,7 +2219,7 @@ dependencies = [ "mimalloc", "nix", "object_store", - "prost 0.13.5", + "prost", "rand 0.9.2", "serde_json", "tempfile", @@ -2158,7 +2234,7 @@ dependencies = [ [[package]] name = "datafusion-execution" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "async-trait", @@ -2179,7 +2255,7 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "async-trait", @@ -2203,7 +2279,7 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "datafusion-common", @@ -2214,7 +2290,7 @@ dependencies = [ [[package]] name = "datafusion-ffi" -version = "50.3.0" +version = "51.0.0" dependencies = [ "abi_stable", "arrow", @@ -2229,14 +2305,14 @@ dependencies = [ "doc-comment", "futures", "log", - "prost 0.13.5", + "prost", "semver", "tokio", ] [[package]] name = "datafusion-functions" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "arrow-buffer", @@ -2257,6 +2333,7 @@ dependencies = [ "itertools 0.14.0", "log", "md-5", + "num-traits", "rand 0.9.2", "regex", "sha2", @@ -2267,7 +2344,7 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "50.3.0" +version = "51.0.0" dependencies = [ "ahash 0.8.12", "arrow", @@ -2288,7 +2365,7 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" -version = "50.3.0" +version = "51.0.0" dependencies = [ "ahash 0.8.12", "arrow", @@ -2301,7 +2378,7 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "arrow-ord", @@ -2324,7 +2401,7 @@ dependencies = [ [[package]] name = "datafusion-functions-table" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "async-trait", @@ -2338,7 +2415,7 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "datafusion-common", @@ -2354,7 +2431,7 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "50.3.0" +version = "51.0.0" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -2362,16 +2439,16 @@ dependencies = [ [[package]] name = "datafusion-macros" 
-version = "50.3.0" +version = "51.0.0" dependencies = [ "datafusion-doc", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] name = "datafusion-optimizer" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "async-trait", @@ -2398,7 +2475,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "50.3.0" +version = "51.0.0" dependencies = [ "ahash 0.8.12", "arrow", @@ -2423,7 +2500,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-adapter" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "datafusion-common", @@ -2436,7 +2513,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "50.3.0" +version = "51.0.0" dependencies = [ "ahash 0.8.12", "arrow", @@ -2448,7 +2525,7 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "datafusion-common", @@ -2468,7 +2545,7 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "50.3.0" +version = "51.0.0" dependencies = [ "ahash 0.8.12", "arrow", @@ -2504,7 +2581,7 @@ dependencies = [ [[package]] name = "datafusion-proto" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "chrono", @@ -2532,7 +2609,7 @@ dependencies = [ "object_store", "pbjson", "pretty_assertions", - "prost 0.13.5", + "prost", "serde", "serde_json", "tokio", @@ -2540,19 +2617,19 @@ dependencies = [ [[package]] name = "datafusion-proto-common" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "datafusion-common", "doc-comment", "pbjson", - "prost 0.13.5", + "prost", "serde", ] [[package]] name = "datafusion-pruning" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "datafusion-common", @@ -2570,7 +2647,7 @@ dependencies = [ [[package]] name = "datafusion-session" -version = "50.3.0" +version = "51.0.0" dependencies = [ "async-trait", "datafusion-common", @@ -2582,7 +2659,7 @@ dependencies = [ [[package]] name = "datafusion-spark" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "bigdecimal", @@ -2602,7 +2679,7 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "bigdecimal", @@ -2628,14 +2705,14 @@ dependencies = [ [[package]] name = "datafusion-sqllogictest" -version = "50.3.0" +version = "51.0.0" dependencies = [ "arrow", "async-trait", "bigdecimal", "bytes", "chrono", - "clap 4.5.48", + "clap 4.5.51", "datafusion", "datafusion-spark", "datafusion-substrait", @@ -2662,7 +2739,7 @@ dependencies = [ [[package]] name = "datafusion-substrait" -version = "50.3.0" +version = "51.0.0" dependencies = [ "async-recursion", "async-trait", @@ -2674,7 +2751,7 @@ dependencies = [ "itertools 0.14.0", "object_store", "pbjson-types", - "prost 0.13.5", + "prost", "serde_json", "substrait", "tokio", @@ -2684,7 +2761,7 @@ dependencies = [ [[package]] name = "datafusion-wasmtest" -version = "50.3.0" +version = "51.0.0" dependencies = [ "chrono", "console_error_panic_hook", @@ -2705,12 +2782,12 @@ dependencies = [ [[package]] name = "deranged" -version = "0.5.3" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d630bccd429a5bb5a64b5e94f693bfc48c9f8566418fda4c494cc94f911f87cc" +checksum = "ececcb659e7ba858fb4f10388c250a7252eb0a27373f1a72b8748afdd248e587" dependencies = [ "powerfmt", - "serde", + "serde_core", ] [[package]] @@ -2748,7 +2825,7 @@ dependencies = [ "libc", "option-ext", "redox_users", - "windows-sys 0.61.0", 
+ "windows-sys 0.61.2", ] [[package]] @@ -2759,14 +2836,14 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] name = "doc-comment" -version = "0.3.3" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" +checksum = "780955b8b195a21ab8e4ac6b60dd1dbdcec1dc6c51c0617964b08c81785e12c9" [[package]] name = "docker_credential" @@ -2781,18 +2858,18 @@ dependencies = [ [[package]] name = "dtor" -version = "0.0.6" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97cbdf2ad6846025e8e25df05171abfb30e3ababa12ee0a0e44b9bbe570633a8" +checksum = "404d02eeb088a82cfd873006cb713fe411306c7d182c344905e101fb1167d301" dependencies = [ "dtor-proc-macro", ] [[package]] name = "dtor-proc-macro" -version = "0.0.5" +version = "0.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7454e41ff9012c00d53cf7f475c5e3afa3b91b7c90568495495e8d9bf47a1055" +checksum = "f678cf4a922c215c63e0de95eb1ff08a958a81d47e485cf9da1e27bf6305cfa5" [[package]] name = "dunce" @@ -2815,7 +2892,7 @@ dependencies = [ "enum-ordinalize", "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -2838,29 +2915,29 @@ checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d" [[package]] name = "enum-ordinalize" -version = "4.3.0" +version = "4.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fea0dcfa4e54eeb516fe454635a95753ddd39acda650ce703031c6973e315dd5" +checksum = "4a1091a7bb1f8f2c4b28f1fe2cef4980ca2d410a3d727d67ecc3178c9b0800f0" dependencies = [ "enum-ordinalize-derive", ] [[package]] name = "enum-ordinalize-derive" -version = "4.3.1" +version = "4.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d28318a75d4aead5c4db25382e8ef717932d0346600cacae6357eb5941bc5ff" +checksum = "8ca9601fb2d62598ee17836250842873a413586e5d7ed88b356e38ddbb0ec631" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] name = "env_filter" -version = "0.1.3" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0" +checksum = "1bf3c259d255ca70051b30e2e95b5446cdb8949ac4cd22c0d7fd634d89f568e2" dependencies = [ "log", "regex", @@ -2892,7 +2969,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.61.0", + "windows-sys 0.61.2", ] [[package]] @@ -2985,9 +3062,9 @@ dependencies = [ [[package]] name = "find-msvc-tools" -version = "0.1.2" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ced73b1dacfc750a6db6c0a0c3a3853c8b41997e2e2c563dc90804ae6867959" +checksum = "52051878f80a721bb68ebfbc930e07b65ba72f2da88968ea5c06fd6ca3d3a127" [[package]] name = "fixedbitset" @@ -2997,19 +3074,19 @@ checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" [[package]] name = "flatbuffers" -version = "25.2.10" +version = "25.9.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1045398c1bfd89168b5fd3f1fc11f6e70b34f6f66300c87d44d3de849463abf1" +checksum = "09b6620799e7340ebd9968d2e0708eb82cf1971e9a16821e2091b6d6e475eed5" 
dependencies = [ - "bitflags 2.9.4", + "bitflags 2.10.0", "rustc_version", ] [[package]] name = "flate2" -version = "1.1.4" +version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc5a4e564e38c699f2880d3fda590bedc2e69f3f84cd48b457bd892ce61d0aa9" +checksum = "bfe33edd8e85a12a67454e37f8c75e730830d83e313556ab9ebf9ee7fbeb3bfb" dependencies = [ "crc32fast", "libz-rs-sys", @@ -3028,6 +3105,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "form_urlencoded" version = "1.2.2" @@ -3039,9 +3122,9 @@ dependencies = [ [[package]] name = "fs-err" -version = "3.1.2" +version = "3.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44f150ffc8782f35521cec2b23727707cb4045706ba3c854e86bef66b3a8cdbd" +checksum = "6ad492b2cf1d89d568a43508ab24f98501fe03f2f31c01e1d0fe7366a71745d2" dependencies = [ "autocfg", ] @@ -3114,7 +3197,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -3157,16 +3240,16 @@ dependencies = [ name = "gen" version = "0.1.0" dependencies = [ - "pbjson-build 0.8.0", - "prost-build 0.14.1", + "pbjson-build", + "prost-build", ] [[package]] name = "gen-common" version = "0.1.0" dependencies = [ - "pbjson-build 0.8.0", - "prost-build 0.14.1", + "pbjson-build", + "prost-build", ] [[package]] @@ -3180,9 +3263,9 @@ dependencies = [ [[package]] name = "generic-array" -version = "0.14.7" +version = "0.14.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +checksum = "4bb6743198531e02858aeaea5398fcc883e71851fcbcb5a2f773e2fb6cb1edf2" dependencies = [ "typenum", "version_check", @@ -3223,9 +3306,9 @@ checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" [[package]] name = "globset" -version = "0.4.16" +version = "0.4.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54a1028dfc5f5df5da8a56a73e6c153c9a9708ec57232470703592a3f18e49f5" +checksum = "52dfc19153a48bde0cbd630453615c8151bce3a5adfac7a0aebfbf0a1e1f57e3" dependencies = [ "aho-corasick", "bstr", @@ -3255,9 +3338,9 @@ dependencies = [ [[package]] name = "half" -version = "2.7.0" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e54c115d4f30f52c67202f079c5f9d8b49db4691f460fdb0b4c2e838261b2ba5" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" dependencies = [ "cfg-if", "crunchy", @@ -3290,9 +3373,7 @@ version = "0.15.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ - "allocator-api2", - "equivalent", - "foldhash", + "foldhash 0.1.5", ] [[package]] @@ -3300,6 +3381,11 @@ name = "hashbrown" version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash 0.2.0", +] [[package]] name = "heck" @@ -3316,12 +3402,6 @@ version = "0.5.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" -[[package]] -name = "hermit-abi" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" - [[package]] name = "hex" version = "0.4.3" @@ -3339,11 +3419,11 @@ dependencies = [ [[package]] name = "home" -version = "0.5.11" +version = "0.5.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "589533453244b0995c858700322199b2becb13b627df2851f64a2775d024abcf" +checksum = "cc627f471c528ff0c4a49e1d5e60450c8f6461dd6d10ba9dcd3a61d3dff7728d" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -3506,7 +3586,7 @@ dependencies = [ "libc", "percent-encoding", "pin-project-lite", - "socket2 0.6.0", + "socket2", "tokio", "tower-service", "tracing", @@ -3539,7 +3619,7 @@ dependencies = [ "js-sys", "log", "wasm-bindgen", - "windows-core 0.62.0", + "windows-core 0.62.2", ] [[package]] @@ -3553,9 +3633,9 @@ dependencies = [ [[package]] name = "icu_collections" -version = "2.0.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "200072f5d0e3614556f94a9930d5dc3e0662a652823904c3a75dc3b0af7fee47" +checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43" dependencies = [ "displaydoc", "potential_utf", @@ -3566,9 +3646,9 @@ dependencies = [ [[package]] name = "icu_locale_core" -version = "2.0.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0cde2700ccaed3872079a65fb1a78f6c0a36c91570f28755dda67bc8f7d9f00a" +checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6" dependencies = [ "displaydoc", "litemap", @@ -3579,11 +3659,10 @@ dependencies = [ [[package]] name = "icu_normalizer" -version = "2.0.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "436880e8e18df4d7bbc06d58432329d6458cc84531f7ac5f024e93deadb37979" +checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599" dependencies = [ - "displaydoc", "icu_collections", "icu_normalizer_data", "icu_properties", @@ -3594,42 +3673,38 @@ dependencies = [ [[package]] name = "icu_normalizer_data" -version = "2.0.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00210d6893afc98edb752b664b8890f0ef174c8adbb8d0be9710fa66fbbf72d3" +checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a" [[package]] name = "icu_properties" -version = "2.0.1" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "016c619c1eeb94efb86809b015c58f479963de65bdb6253345c1a1276f22e32b" +checksum = "e93fcd3157766c0c8da2f8cff6ce651a31f0810eaa1c51ec363ef790bbb5fb99" dependencies = [ - "displaydoc", "icu_collections", "icu_locale_core", "icu_properties_data", "icu_provider", - "potential_utf", "zerotrie", "zerovec", ] [[package]] name = "icu_properties_data" -version = "2.0.1" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "298459143998310acd25ffe6810ed544932242d3f07083eee1084d83a71bd632" +checksum = "02845b3647bb045f1100ecd6480ff52f34c35f82d9880e029d329c21d1054899" [[package]] name = "icu_provider" -version = "2.0.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"03c80da27b5f4187909049ee2d72f276f0d9f99a42c306bd0131ecfe04d8e5af" +checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614" dependencies = [ "displaydoc", "icu_locale_core", - "stable_deref_trait", - "tinystr", "writeable", "yoke", "zerofrom", @@ -3689,22 +3764,25 @@ dependencies = [ [[package]] name = "indicatif" -version = "0.18.0" +version = "0.18.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70a646d946d06bedbbc4cac4c218acf4bbf2d87757a784857025f4d447e4e1cd" +checksum = "ade6dfcba0dfb62ad59e59e7241ec8912af34fd29e0e743e3db992bd278e8b65" dependencies = [ "console 0.16.1", "portable-atomic", - "unicode-width 0.2.1", + "unicode-width 0.2.2", "unit-prefix", "web-time", ] [[package]] name = "indoc" -version = "2.0.6" +version = "2.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd" +checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706" +dependencies = [ + "rustversion", +] [[package]] name = "insta" @@ -3746,39 +3824,19 @@ checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" [[package]] name = "iri-string" -version = "0.7.8" +version = "0.7.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbc5ebe9c3a1a7a5127f920a418f7585e9e758e911d0466ed004f393b0e380b2" +checksum = "4f867b9d1d896b67beb18518eda36fdb77a32ea590de864f1325b294a6d14397" dependencies = [ "memchr", "serde", ] -[[package]] -name = "is-terminal" -version = "0.4.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e04d7f318608d35d4b61ddd75cbdaee86b023ebe2bd5a66ee0915f0bf93095a9" -dependencies = [ - "hermit-abi", - "libc", - "windows-sys 0.59.0", -] - [[package]] name = "is_terminal_polyfill" -version = "1.70.1" +version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" - -[[package]] -name = "itertools" -version = "0.10.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" -dependencies = [ - "either", -] +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" [[package]] name = "itertools" @@ -3806,26 +3864,26 @@ checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" [[package]] name = "jiff" -version = "0.2.15" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be1f93b8b1eb69c77f24bbb0afdf66f54b632ee39af40ca21c4365a1d7347e49" +checksum = "49cce2b81f2098e7e3efc35bc2e0a6b7abec9d34128283d7a26fa8f32a6dbb35" dependencies = [ "jiff-static", "log", "portable-atomic", "portable-atomic-util", - "serde", + "serde_core", ] [[package]] name = "jiff-static" -version = "0.2.15" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03343451ff899767262ec32146f6d559dd759fdadf42ff0e227c7c48f72594b4" +checksum = "980af8b43c3ad5d8d349ace167ec8170839f753a42d233ba19e08afe1850fa69" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -3840,9 +3898,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.81" +version = "0.3.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec48937a97411dcb524a265206ccd4c90bb711fca92b2792c407f268825b9305" +checksum = 
"b011eec8cc36da2aab2d5cff675ec18454fad408585853910a202391cf9f8e65" dependencies = [ "once_cell", "wasm-bindgen", @@ -3919,9 +3977,9 @@ checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7" [[package]] name = "libc" -version = "0.2.176" +version = "0.2.177" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58f929b4d672ea937a23a1ab494143d968337a5f47e56d0815df1e0890ddf174" +checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" [[package]] name = "libloading" @@ -3940,7 +3998,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" dependencies = [ "cfg-if", - "windows-link 0.2.0", + "windows-link 0.2.1", ] [[package]] @@ -3966,9 +4024,9 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "416f7e718bdb06000964960ffa43b4335ad4012ae8b99060261aa4a8088d5ccb" dependencies = [ - "bitflags 2.9.4", + "bitflags 2.10.0", "libc", - "redox_syscall 0.5.17", + "redox_syscall", ] [[package]] @@ -3979,7 +4037,7 @@ checksum = "5297962ef19edda4ce33aaa484386e0a5b3d7f2f4e037cbeee00503ef6b29d33" dependencies = [ "anstream", "anstyle", - "clap 4.5.48", + "clap 4.5.51", "escape8259", ] @@ -4000,17 +4058,16 @@ checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" [[package]] name = "litemap" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956" +checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" [[package]] name = "lock_api" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96936507f153605bddfcda068dd804796c84324ed2510809e5b2a624c81da765" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" dependencies = [ - "autocfg", "scopeguard", ] @@ -4064,9 +4121,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.7.5" +version = "2.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" +checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" [[package]] name = "memoffset" @@ -4120,13 +4177,13 @@ dependencies = [ [[package]] name = "mio" -version = "1.0.4" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78bed444cc8a2160f01cbcf811ef18cac863ad68ae8ca62092e8db51d51c761c" +checksum = "69d83b0086dc8ecf3ce9ae2874b2d1290252e2a30720bea58a5c6639b0092873" dependencies = [ "libc", "wasi", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -4150,7 +4207,7 @@ version = "0.30.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6" dependencies = [ - "bitflags 2.9.4", + "bitflags 2.10.0", "cfg-if", "cfg_aliases", "libc", @@ -4177,11 +4234,11 @@ dependencies = [ [[package]] name = "nu-ansi-term" -version = "0.50.1" +version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4a28e057d01f97e61255210fcff094d74ed0466038633e95017f5beb68e4399" +checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -4267,23 +4324,32 @@ dependencies = [ [[package]] name = 
"objc2-core-foundation" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c10c2894a6fed806ade6027bcd50662746363a9589d3ec9d9bef30a4e4bc166" +checksum = "2a180dd8642fa45cdb7dd721cd4c11b1cadd4929ce112ebd8b9f5803cc79d536" dependencies = [ - "bitflags 2.9.4", + "bitflags 2.10.0", ] [[package]] name = "objc2-io-kit" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71c1c64d6120e51cd86033f67176b1cb66780c2efe34dec55176f77befd93c0a" +checksum = "33fafba39597d6dc1fb709123dfa8289d39406734be322956a69f0931c73bb15" dependencies = [ "libc", "objc2-core-foundation", ] +[[package]] +name = "object" +version = "0.32.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" +dependencies = [ + "memchr", +] + [[package]] name = "object_store" version = "0.12.4" @@ -4329,9 +4395,9 @@ checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" [[package]] name = "once_cell_polyfill" -version = "1.70.1" +version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" [[package]] name = "oorandom" @@ -4368,15 +4434,15 @@ checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e" [[package]] name = "owo-colors" -version = "4.2.2" +version = "4.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48dd4f4a2c8405440fd0462561f0e5806bd0f77e86f51c761481bdd4018b545e" +checksum = "9c6901729fa79e91a0913333229e9ca5dc725089d1c363b2f4b4760709dc4a52" [[package]] name = "parking_lot" -version = "0.12.4" +version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70d58bf43669b5795d1576d0641cfb6fbb2057bf629506267a92807158584a13" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" dependencies = [ "lock_api", "parking_lot_core", @@ -4384,22 +4450,22 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.11" +version = "0.9.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc838d2a56b5b1a6c25f55575dfc605fabb63bb2365f6c2353ef9159aa69e4a5" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" dependencies = [ "cfg-if", "libc", - "redox_syscall 0.5.17", + "redox_syscall", "smallvec", - "windows-targets 0.52.6", + "windows-link 0.2.1", ] [[package]] name = "parquet" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0dbd48ad52d7dccf8ea1b90a3ddbfaea4f69878dd7683e51c507d4bc52b5b27" +checksum = "7a0f31027ef1af7549f7cec603a9a21dce706d3f8d7c2060a68f43c1773be95a" dependencies = [ "ahash 0.8.12", "arrow-array", @@ -4418,8 +4484,9 @@ dependencies = [ "half", "hashbrown 0.16.0", "lz4_flex", - "num", "num-bigint", + "num-integer", + "num-traits", "object_store", "paste", "ring", @@ -4454,7 +4521,7 @@ dependencies = [ "regex", "regex-syntax", "structmeta", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -4465,26 +4532,14 @@ checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" [[package]] name = "pbjson" -version = "0.7.0" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"c7e6349fa080353f4a597daffd05cb81572a9c031a6d4fff7e504947496fcc68" +checksum = "898bac3fa00d0ba57a4e8289837e965baa2dee8c3749f3b11d45a64b4223d9c3" dependencies = [ - "base64 0.21.7", + "base64 0.22.1", "serde", ] -[[package]] -name = "pbjson-build" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6eea3058763d6e656105d1403cb04e0a41b7bbac6362d413e7c33be0c32279c9" -dependencies = [ - "heck 0.5.0", - "itertools 0.13.0", - "prost 0.13.5", - "prost-types 0.13.5", -] - [[package]] name = "pbjson-build" version = "0.8.0" @@ -4493,22 +4548,22 @@ checksum = "af22d08a625a2213a78dbb0ffa253318c5c79ce3133d32d296655a7bdfb02095" dependencies = [ "heck 0.5.0", "itertools 0.14.0", - "prost 0.14.1", - "prost-types 0.14.1", + "prost", + "prost-types", ] [[package]] name = "pbjson-types" -version = "0.7.0" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e54e5e7bfb1652f95bc361d76f3c780d8e526b134b85417e774166ee941f0887" +checksum = "8e748e28374f10a330ee3bb9f29b828c0ac79831a32bab65015ad9b661ead526" dependencies = [ "bytes", "chrono", "pbjson", - "pbjson-build 0.7.0", - "prost 0.13.5", - "prost-build 0.13.5", + "pbjson-build", + "prost", + "prost-build", "serde", ] @@ -4594,7 +4649,7 @@ checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -4667,7 +4722,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -4690,9 +4745,9 @@ dependencies = [ [[package]] name = "postgres-types" -version = "0.2.10" +version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77a120daaabfcb0e324d5bf6e411e9222994cb3795c79943a0ef28ed27ea76e4" +checksum = "ef4605b7c057056dd35baeb6ac0c0338e4975b1f2bef0f65da953285eb007095" dependencies = [ "bytes", "chrono", @@ -4703,9 +4758,9 @@ dependencies = [ [[package]] name = "potential_utf" -version = "0.1.3" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84df19adbe5b5a0782edcab45899906947ab039ccf4573713735ee7de1e6b08a" +checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77" dependencies = [ "zerovec", ] @@ -4742,7 +4797,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -4780,23 +4835,13 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.101" +version = "1.0.103" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" +checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8" dependencies = [ "unicode-ident", ] -[[package]] -name = "prost" -version = "0.13.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5" -dependencies = [ - "bytes", - "prost-derive 0.13.5", -] - [[package]] name = "prost" version = "0.14.1" @@ -4804,27 +4849,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7231bd9b3d3d33c86b58adbac74b5ec0ad9f496b19d22801d773636feaa95f3d" dependencies = [ "bytes", - "prost-derive 0.14.1", -] - -[[package]] -name = "prost-build" -version = "0.13.5" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf" -dependencies = [ - "heck 0.5.0", - "itertools 0.14.0", - "log", - "multimap", - "once_cell", - "petgraph 0.7.1", - "prettyplease", - "prost 0.13.5", - "prost-types 0.13.5", - "regex", - "syn 2.0.106", - "tempfile", + "prost-derive", ] [[package]] @@ -4840,26 +4865,13 @@ dependencies = [ "once_cell", "petgraph 0.7.1", "prettyplease", - "prost 0.14.1", - "prost-types 0.14.1", + "prost", + "prost-types", "regex", - "syn 2.0.106", + "syn 2.0.110", "tempfile", ] -[[package]] -name = "prost-derive" -version = "0.13.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" -dependencies = [ - "anyhow", - "itertools 0.14.0", - "proc-macro2", - "quote", - "syn 2.0.106", -] - [[package]] name = "prost-derive" version = "0.14.1" @@ -4870,16 +4882,7 @@ dependencies = [ "itertools 0.14.0", "proc-macro2", "quote", - "syn 2.0.106", -] - -[[package]] -name = "prost-types" -version = "0.13.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16" -dependencies = [ - "prost 0.13.5", + "syn 2.0.110", ] [[package]] @@ -4888,7 +4891,7 @@ version = "0.14.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9b4db3d6da204ed77bb26ba83b6122a73aeb2e87e25fbf7ad2e84c4ccbf8f72" dependencies = [ - "prost 0.14.1", + "prost", ] [[package]] @@ -4902,10 +4905,11 @@ dependencies = [ [[package]] name = "psm" -version = "0.1.26" +version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e944464ec8536cd1beb0bbfd96987eb5e3b72f2ecdafdc5c769a37f1fa2ae1f" +checksum = "d11f2fedc3b7dafdc2851bc52f277377c5473d378859be234bc7ebb593144d01" dependencies = [ + "ar_archive_writer", "cc", ] @@ -4931,9 +4935,9 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.25.1" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8970a78afe0628a3e3430376fc5fd76b6b45c4d43360ffd6cdd40bdde72b682a" +checksum = "7ba0117f4212101ee6544044dae45abe1083d30ce7b29c4b5cbdfa2354e07383" dependencies = [ "indoc", "libc", @@ -4948,19 +4952,18 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.25.1" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "458eb0c55e7ece017adeba38f2248ff3ac615e53660d7c71a238d7d2a01c7598" +checksum = "4fc6ddaf24947d12a9aa31ac65431fb1b851b8f4365426e182901eabfb87df5f" dependencies = [ - "once_cell", "target-lexicon", ] [[package]] name = "pyo3-ffi" -version = "0.25.1" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7114fe5457c61b276ab77c5055f206295b812608083644a5c5b2640c3102565c" +checksum = "025474d3928738efb38ac36d4744a74a400c901c7596199e20e45d98eb194105" dependencies = [ "libc", "pyo3-build-config", @@ -4968,27 +4971,27 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.25.1" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8725c0a622b374d6cb051d11a0983786448f7785336139c3c94f5aa6bef7e50" +checksum = "2e64eb489f22fe1c95911b77c44cc41e7c19f3082fc81cce90f657cdc42ffded" dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] name = "pyo3-macros-backend" -version = "0.25.1" +version = 
"0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4109984c22491085343c05b0dbc54ddc405c3cf7b4374fc533f5c3313a572ccc" +checksum = "100246c0ecf400b475341b8455a9213344569af29a3c841d29270e53102e0fcf" dependencies = [ "heck 0.5.0", "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -5020,7 +5023,7 @@ dependencies = [ "quinn-udp", "rustc-hash", "rustls", - "socket2 0.6.0", + "socket2", "thiserror", "tokio", "tracing", @@ -5057,16 +5060,16 @@ dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2 0.6.0", + "socket2", "tracing", "windows-sys 0.60.2", ] [[package]] name = "quote" -version = "1.0.41" +version = "1.0.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1" +checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f" dependencies = [ "proc-macro2", ] @@ -5199,25 +5202,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] name = "redox_syscall" -version = "0.3.5" +version = "0.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" dependencies = [ - "bitflags 1.3.2", -] - -[[package]] -name = "redox_syscall" -version = "0.5.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5407465600fb0548f1442edf71dd20683c6ed326200ace4b1ef0763521bb3b77" -dependencies = [ - "bitflags 2.9.4", + "bitflags 2.10.0", ] [[package]] @@ -5233,29 +5227,29 @@ dependencies = [ [[package]] name = "ref-cast" -version = "1.0.24" +version = "1.0.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a0ae411dbe946a674d89546582cea4ba2bb8defac896622d6496f14c23ba5cf" +checksum = "f354300ae66f76f1c85c5f84693f0ce81d747e2c3f21a45fef496d89c960bf7d" dependencies = [ "ref-cast-impl", ] [[package]] name = "ref-cast-impl" -version = "1.0.24" +version = "1.0.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1165225c21bff1f3bbce98f5a1f889949bc902d3575308cc7b0de30b4f6d27c7" +checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] name = "regex" -version = "1.11.3" +version = "1.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b5288124840bee7b386bc413c487869b360b2b4ec421ea56425128692f2a82c" +checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" dependencies = [ "aho-corasick", "memchr", @@ -5265,9 +5259,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.11" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "833eb9ce86d40ef33cb1306d8accf7bc8ec2bfea4355cbdebb3df68b40925cad" +checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" dependencies = [ "aho-corasick", "memchr", @@ -5276,23 +5270,23 @@ dependencies = [ [[package]] name = "regex-lite" -version = "0.1.7" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "943f41321c63ef1c92fd763bfe054d2668f7f225a5c29f0105903dc2fc04ba30" +checksum = 
"8d942b98df5e658f56f20d592c7f868833fe38115e65c33003d8cd224b0155da" [[package]] name = "regex-syntax" -version = "0.8.6" +version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001" +checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" [[package]] name = "regress" -version = "0.10.4" +version = "0.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "145bb27393fe455dd64d6cbc8d059adfa392590a45eadf079c01b11857e7b010" +checksum = "2057b2325e68a893284d1538021ab90279adac1139957ca2a74426c6f118fb48" dependencies = [ - "hashbrown 0.15.5", + "hashbrown 0.16.0", "memchr", ] @@ -5322,9 +5316,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.12.23" +version = "0.12.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d429f34c8092b2d42c7c93cec323bb4adeb7c67698f70839adec842ec10c7ceb" +checksum = "9d0946410b9f7b082a427e4ef5c8ff541a88b357bc6c637c40db3a68ac70a36f" dependencies = [ "base64 0.22.1", "bytes", @@ -5407,21 +5401,20 @@ dependencies = [ [[package]] name = "rstest" -version = "0.25.0" +version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fc39292f8613e913f7df8fa892b8944ceb47c247b78e1b1ae2f09e019be789d" +checksum = "f5a3193c063baaa2a95a33f03035c8a72b83d97a54916055ba22d35ed3839d49" dependencies = [ "futures-timer", "futures-util", "rstest_macros", - "rustc_version", ] [[package]] name = "rstest_macros" -version = "0.25.0" +version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f168d99749d307be9de54d23fd226628d99768225ef08f6ffb52e0182a27746" +checksum = "9c845311f0ff7951c5506121a9ad75aec44d083c31583b2ea5a30bcb0b0abba0" dependencies = [ "cfg-if", "glob", @@ -5431,7 +5424,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.106", + "syn 2.0.110", "unicode-ident", ] @@ -5443,14 +5436,14 @@ checksum = "b3a8fb4672e840a587a66fc577a5491375df51ddb88f2a2c2a792598c326fe14" dependencies = [ "quote", "rand 0.8.5", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] name = "rust_decimal" -version = "1.38.0" +version = "1.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8975fc98059f365204d635119cf9c5a60ae67b841ed49b5422a9a7e56cdfac0" +checksum = "35affe401787a9bd846712274d97654355d21b2a2c092a3139aabe31e9022282" dependencies = [ "arrayvec", "borsh", @@ -5484,20 +5477,21 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" dependencies = [ - "bitflags 2.9.4", + "bitflags 2.10.0", "errno", "libc", "linux-raw-sys", - "windows-sys 0.61.0", + "windows-sys 0.61.2", ] [[package]] name = "rustls" -version = "0.23.32" +version = "0.23.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd3c25631629d034ce7cd9940adc9d45762d46de2b0f57193c4443b92c6d4d40" +checksum = "533f54bc6a7d4f647e46ad909549eda97bf5afc1585190ef692b4286b198bd8f" dependencies = [ "aws-lc-rs", + "log", "once_cell", "ring", "rustls-pki-types", @@ -5508,9 +5502,9 @@ dependencies = [ [[package]] name = "rustls-native-certs" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fcff2dd52b58a8d98a70243663a0d234c4e2b79235637849d15913394a247d3" +checksum = "9980d917ebb0c0536119ba501e90834767bffc3d60641457fd84a1f3fd337923" 
dependencies = [ "openssl-probe", "rustls-pki-types", @@ -5529,9 +5523,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.12.0" +version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "229a4a4c221013e7e1f1a043678c5cc39fe5171437c88fb47151a21e6f5b5c79" +checksum = "94182ad936a0c91c324cd46c6511b9510ed16af436d7b5bab34beab0afd55f7a" dependencies = [ "web-time", "zeroize", @@ -5539,9 +5533,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.103.6" +version = "0.103.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8572f3c2cb9934231157b45499fc41e1f58c589fdfb81a844ba873265e80f8eb" +checksum = "2ffdfa2f5286e2247234e03f680868ac2815974dc39e00ea15adc445d0aafe52" dependencies = [ "aws-lc-rs", "ring", @@ -5561,7 +5555,7 @@ version = "17.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e902948a25149d50edc1a8e0141aad50f54e22ba83ff988cf8f7c9ef07f50564" dependencies = [ - "bitflags 2.9.4", + "bitflags 2.10.0", "cfg-if", "clipboard-win", "fd-lock", @@ -5572,7 +5566,7 @@ dependencies = [ "nix", "radix_trie", "unicode-segmentation", - "unicode-width 0.2.1", + "unicode-width 0.2.2", "utf8parse", "windows-sys 0.60.2", ] @@ -5598,7 +5592,7 @@ version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "891d81b926048e76efe18581bf793546b4c0eaf8448d72be8de2bbee5fd166e1" dependencies = [ - "windows-sys 0.61.0", + "windows-sys 0.61.2", ] [[package]] @@ -5627,9 +5621,9 @@ dependencies = [ [[package]] name = "schemars" -version = "1.0.4" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82d20c4491bc164fa2f6c5d44565947a52ad80b9505d8e36f8d54c27c739fcd0" +checksum = "9558e172d4e8533736ba97870c4b2cd63f84b382a3d6eb063da41b91cce17289" dependencies = [ "dyn-clone", "ref-cast", @@ -5646,7 +5640,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -5663,11 +5657,11 @@ checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b" [[package]] name = "security-framework" -version = "3.5.0" +version = "3.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc198e42d9b7510827939c9a15f5062a0c913f3371d765977e586d2fe6c16f4a" +checksum = "b3297343eaf830f66ede390ea39da1d462b6b0c1b000f420d0a83f898bbbe6ef" dependencies = [ - "bitflags 2.9.4", + "bitflags 2.10.0", "core-foundation", "core-foundation-sys", "libc", @@ -5737,7 +5731,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -5748,7 +5742,7 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -5772,7 +5766,7 @@ checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -5784,7 +5778,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -5801,9 +5795,9 @@ dependencies = [ [[package]] name = "serde_with" -version = "3.14.1" +version = "3.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c522100790450cf78eeac1507263d0a350d4d5b30df0c8e1fe051a10c22b376e" +checksum = 
"aa66c845eee442168b2c8134fec70ac50dc20e760769c8ba0ad1319ca1959b04" dependencies = [ "base64 0.22.1", "chrono", @@ -5811,9 +5805,8 @@ dependencies = [ "indexmap 1.9.3", "indexmap 2.12.0", "schemars 0.9.0", - "schemars 1.0.4", - "serde", - "serde_derive", + "schemars 1.1.0", + "serde_core", "serde_json", "serde_with_macros", "time", @@ -5821,14 +5814,14 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "3.14.1" +version = "3.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "327ada00f7d64abaac1e55a6911e90cf665aa051b9a561c7006c157f4633135e" +checksum = "b91a903660542fced4e99881aa481bdbaec1634568ee02e0b8bd57c64cb38955" dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -5952,22 +5945,12 @@ dependencies = [ [[package]] name = "socket2" -version = "0.5.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678" -dependencies = [ - "libc", - "windows-sys 0.52.0", -] - -[[package]] -name = "socket2" -version = "0.6.0" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "233504af464074f9d066d7b5416c5f9b894a5862a6506e306f7b816cdd6f1807" +checksum = "17129e116933cf371d018bb80ae557e889637989d8638274fb25622827b03881" dependencies = [ "libc", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] @@ -6014,20 +5997,20 @@ checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] name = "stable_deref_trait" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" [[package]] name = "stacker" -version = "0.1.21" +version = "0.1.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cddb07e32ddb770749da91081d8d0ac3a16f1a569a18b20348cd371f5dead06b" +checksum = "e1f8b29fb42aafcea4edeeb6b2f2d7ecd0d969c48b4cf0d2e64aafc471dd6e59" dependencies = [ "cc", "cfg-if", @@ -6062,7 +6045,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta-derive", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -6073,7 +6056,7 @@ checksum = "152a0b65a590ff6c3da95cabe2353ee04e6167c896b28e3b14478c2636c922fc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -6100,31 +6083,12 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "strum" -version = "0.26.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" - [[package]] name = "strum" version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" -[[package]] -name = "strum_macros" -version = "0.26.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" -dependencies = [ - "heck 0.5.0", - "proc-macro2", - "quote", - "rustversion", - "syn 2.0.106", -] - [[package]] name = "strum_macros" version = "0.27.2" @@ -6134,7 +6098,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -6149,18 +6113,18 @@ dependencies = [ [[package]] name 
= "substrait" -version = "0.58.0" +version = "0.62.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de6d24c270c6c672a86c183c3a8439ba46c1936f93cf7296aa692de3b0ff0228" +checksum = "21f1cb6d0bcd097a39fc25f7236236be29881fe122e282e4173d6d007a929927" dependencies = [ "heck 0.5.0", "pbjson", - "pbjson-build 0.7.0", + "pbjson-build", "pbjson-types", "prettyplease", - "prost 0.13.5", - "prost-build 0.13.5", - "prost-types 0.13.5", + "prost", + "prost-build", + "prost-types", "protobuf-src", "regress", "schemars 0.8.22", @@ -6168,7 +6132,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.106", + "syn 2.0.110", "typify", "walkdir", ] @@ -6192,9 +6156,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.106" +version = "2.0.110" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6" +checksum = "a99801b5bd34ede4cf3fc688c5919368fea4e4814a4664359503e6015b280aea" dependencies = [ "proc-macro2", "quote", @@ -6218,7 +6182,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -6257,7 +6221,7 @@ dependencies = [ "getrandom 0.3.4", "once_cell", "rustix", - "windows-sys 0.61.0", + "windows-sys 0.61.2", ] [[package]] @@ -6273,13 +6237,13 @@ dependencies = [ [[package]] name = "testcontainers" -version = "0.24.0" +version = "0.25.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23bb7577dca13ad86a78e8271ef5d322f37229ec83b8d98da6d996c588a1ddb1" +checksum = "3f3ac71069f20ecfa60c396316c283fbf35e6833a53dff551a31b5458da05edc" dependencies = [ + "astral-tokio-tar", "async-trait", "bollard", - "bollard-stubs", "bytes", "docker_credential", "either", @@ -6295,16 +6259,16 @@ dependencies = [ "thiserror", "tokio", "tokio-stream", - "tokio-tar", "tokio-util", + "ulid", "url", ] [[package]] name = "testcontainers-modules" -version = "0.12.1" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eac95cde96549fc19c6bf19ef34cc42bd56e264c1cb97e700e21555be0ecf9e2" +checksum = "1966329d5bb3f89d33602d2db2da971fb839f9297dad16527abf4564e2ae0a6d" dependencies = [ "testcontainers", ] @@ -6335,7 +6299,7 @@ checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -6400,9 +6364,9 @@ dependencies = [ [[package]] name = "tinystr" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b" +checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869" dependencies = [ "displaydoc", "zerovec", @@ -6445,9 +6409,9 @@ dependencies = [ "parking_lot", "pin-project-lite", "signal-hook-registry", - "socket2 0.6.0", + "socket2", "tokio-macros", - "windows-sys 0.61.0", + "windows-sys 0.61.2", ] [[package]] @@ -6458,14 +6422,14 @@ checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] name = "tokio-postgres" -version = "0.7.14" +version = "0.7.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a156efe7fff213168257853e1dfde202eed5f487522cbbbf7d219941d753d853" +checksum = 
"2b40d66d9b2cfe04b628173409368e58247e8eddbbd3b0e6c6ba1d09f20f6c9e" dependencies = [ "async-trait", "byteorder", @@ -6481,7 +6445,7 @@ dependencies = [ "postgres-protocol", "postgres-types", "rand 0.9.2", - "socket2 0.6.0", + "socket2", "tokio", "tokio-util", "whoami", @@ -6489,9 +6453,9 @@ dependencies = [ [[package]] name = "tokio-rustls" -version = "0.26.3" +version = "0.26.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05f63835928ca123f1bef57abbcd23bb2ba0ac9ae1235f1e65bda0d06e7786bd" +checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61" dependencies = [ "rustls", "tokio", @@ -6508,26 +6472,11 @@ dependencies = [ "tokio", ] -[[package]] -name = "tokio-tar" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d5714c010ca3e5c27114c1cdeb9d14641ace49874aa5626d7149e47aedace75" -dependencies = [ - "filetime", - "futures-core", - "libc", - "redox_syscall 0.3.5", - "tokio", - "tokio-stream", - "xattr", -] - [[package]] name = "tokio-util" -version = "0.7.16" +version = "0.7.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14307c986784f72ef81c89db7d9e28d6ac26d16213b109ea501696195e6e3ce5" +checksum = "2efa149fe76073d6e8fd97ef4f4eca7b67f599660115591483572e406e165594" dependencies = [ "bytes", "futures-core", @@ -6538,18 +6487,18 @@ dependencies = [ [[package]] name = "toml_datetime" -version = "0.7.2" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32f1085dec27c2b6632b04c80b3bb1b4300d6495d1e129693bdda7d91e72eec1" +checksum = "f2cdb639ebbc97961c51720f858597f7f24c4fc295327923af55b74c3c724533" dependencies = [ "serde_core", ] [[package]] name = "toml_edit" -version = "0.23.6" +version = "0.23.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3effe7c0e86fdff4f69cdd2ccc1b96f933e24811c5441d44904e8683e27184b" +checksum = "6485ef6d0d9b5d0ec17244ff7eb05310113c3f316f2d14200d4de56b3cb98f8d" dependencies = [ "indexmap 2.12.0", "toml_datetime", @@ -6559,18 +6508,18 @@ dependencies = [ [[package]] name = "toml_parser" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4cf893c33be71572e0e9aa6dd15e6677937abd686b066eac3f8cd3531688a627" +checksum = "c0cbe268d35bdb4bb5a56a2de88d0ad0eb70af5384a99d648cd4b3d04039800e" dependencies = [ "winnow", ] [[package]] name = "tonic" -version = "0.13.1" +version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e581ba15a835f4d9ea06c55ab1bd4dce26fc53752c69a04aac00703bfb49ba9" +checksum = "eb7613188ce9f7df5bfe185db26c5814347d110db17920415cf2fbcad85e7203" dependencies = [ "async-trait", "axum", @@ -6585,8 +6534,8 @@ dependencies = [ "hyper-util", "percent-encoding", "pin-project", - "prost 0.13.5", - "socket2 0.5.10", + "socket2", + "sync_wrapper", "tokio", "tokio-stream", "tower", @@ -6595,6 +6544,17 @@ dependencies = [ "tracing", ] +[[package]] +name = "tonic-prost" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66bd50ad6ce1252d87ef024b3d64fe4c3cf54a86fb9ef4c631fdd0ded7aeaa67" +dependencies = [ + "bytes", + "prost", + "tonic", +] + [[package]] name = "tower" version = "0.5.2" @@ -6620,7 +6580,7 @@ version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adc82fd73de2a9722ac5da747f12383d2bfdb93591ee6c58486e0097890f05f2" dependencies = [ - "bitflags 2.9.4", + "bitflags 2.10.0", 
"bytes", "futures-util", "http 1.3.1", @@ -6663,7 +6623,7 @@ checksum = "81383ab64e72a7a8b8e13130c49e3dab29def6d0c7d76a03087b3cf71c5c6903" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -6736,9 +6696,9 @@ checksum = "6af6ae20167a9ece4bcb41af5b80f8a1f1df981f6391189ce00fd257af04126a" [[package]] name = "typenum" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" +checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" [[package]] name = "typewit" @@ -6748,9 +6708,9 @@ checksum = "f8c1ae7cc0fdb8b842d65d127cb981574b0d2b249b74d1c7a2986863dc134f71" [[package]] name = "typify" -version = "0.4.3" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7144144e97e987c94758a3017c920a027feac0799df325d6df4fc8f08d02068e" +checksum = "e6d5bcc6f62eb1fa8aa4098f39b29f93dcb914e17158b76c50360911257aa629" dependencies = [ "typify-impl", "typify-macro", @@ -6758,9 +6718,9 @@ dependencies = [ [[package]] name = "typify-impl" -version = "0.4.3" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "062879d46aa4c9dfe0d33b035bbaf512da192131645d05deacb7033ec8581a09" +checksum = "a1eb359f7ffa4f9ebe947fa11a1b2da054564502968db5f317b7e37693cb2240" dependencies = [ "heck 0.5.0", "log", @@ -6771,16 +6731,16 @@ dependencies = [ "semver", "serde", "serde_json", - "syn 2.0.106", + "syn 2.0.110", "thiserror", "unicode-ident", ] [[package]] name = "typify-macro" -version = "0.4.3" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9708a3ceb6660ba3f8d2b8f0567e7d4b8b198e2b94d093b8a6077a751425de9e" +checksum = "911c32f3c8514b048c1b228361bebb5e6d73aeec01696e8cc0e82e2ffef8ab7a" dependencies = [ "proc-macro2", "quote", @@ -6789,10 +6749,20 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.106", + "syn 2.0.110", "typify-impl", ] +[[package]] +name = "ulid" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "470dbf6591da1b39d43c14523b2b469c86879a53e8b758c8e090a470fe7b1fbe" +dependencies = [ + "rand 0.9.2", + "web-time", +] + [[package]] name = "unicode-bidi" version = "0.3.18" @@ -6801,24 +6771,24 @@ checksum = "5c1cb5db39152898a79168971543b1cb5020dff7fe43c8dc468b0885f5e29df5" [[package]] name = "unicode-ident" -version = "1.0.19" +version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d" +checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" [[package]] name = "unicode-normalization" -version = "0.1.24" +version = "0.1.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" +checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8" dependencies = [ "tinyvec", ] [[package]] name = "unicode-properties" -version = "0.1.3" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e70f2a8b45122e719eb623c01822704c4e0907e7e426a05927e1a1cfff5b75d0" +checksum = "7df058c713841ad818f1dc5d3fd88063241cc61f49f5fbea4b951e8cf5a8d71d" [[package]] name = "unicode-segmentation" @@ -6834,9 +6804,9 @@ checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" 
[[package]] name = "unicode-width" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a1a07cc7db3810833284e8d372ccdc6da29741639ecc70c9ec107df0fa6154c" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" [[package]] name = "unindent" @@ -6862,6 +6832,34 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" +[[package]] +name = "ureq" +version = "3.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d39cb1dbab692d82a977c0392ffac19e188bd9186a9f32806f0aaa859d75585a" +dependencies = [ + "base64 0.22.1", + "log", + "percent-encoding", + "rustls", + "rustls-pki-types", + "ureq-proto", + "utf-8", + "webpki-roots", +] + +[[package]] +name = "ureq-proto" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60b4531c118335662134346048ddb0e54cc86bd7e81866757873055f0e38f5d2" +dependencies = [ + "base64 0.22.1", + "http 1.3.1", + "httparse", + "log", +] + [[package]] name = "url" version = "2.5.7" @@ -6880,6 +6878,12 @@ version = "2.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + [[package]] name = "utf8_iter" version = "1.0.4" @@ -6964,9 +6968,9 @@ checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" [[package]] name = "wasm-bindgen" -version = "0.2.104" +version = "0.2.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1da10c01ae9f1ae40cbfac0bac3b1e724b320abfcf52229f80b547c0d250e2d" +checksum = "da95793dfc411fbbd93f5be7715b0578ec61fe87cb1a42b12eb625caa5c5ea60" dependencies = [ "cfg-if", "once_cell", @@ -6975,25 +6979,11 @@ dependencies = [ "wasm-bindgen-shared", ] -[[package]] -name = "wasm-bindgen-backend" -version = "0.2.104" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "671c9a5a66f49d8a47345ab942e2cb93c7d1d0339065d4f8139c486121b43b19" -dependencies = [ - "bumpalo", - "log", - "proc-macro2", - "quote", - "syn 2.0.106", - "wasm-bindgen-shared", -] - [[package]] name = "wasm-bindgen-futures" -version = "0.4.54" +version = "0.4.55" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e038d41e478cc73bae0ff9b36c60cff1c98b8f38f8d7e8061e79ee63608ac5c" +checksum = "551f88106c6d5e7ccc7cd9a16f312dd3b5d36ea8b4954304657d5dfba115d4a0" dependencies = [ "cfg-if", "js-sys", @@ -7004,9 +6994,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.104" +version = "0.2.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ca60477e4c59f5f2986c50191cd972e3a50d8a95603bc9434501cf156a9a119" +checksum = "04264334509e04a7bf8690f2384ef5265f05143a4bff3889ab7a3269adab59c2" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -7014,31 +7004,31 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.104" +version = "0.2.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f07d2f20d4da7b26400c9f4a0511e6e0345b040694e8a75bd41d578fa4421d7" +checksum = "420bc339d9f322e562942d52e115d57e950d12d88983a14c79b86859ee6c7ebc" dependencies = [ + 
"bumpalo", "proc-macro2", "quote", - "syn 2.0.106", - "wasm-bindgen-backend", + "syn 2.0.110", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.104" +version = "0.2.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bad67dc8b2a1a6e5448428adec4c3e84c43e561d8c9ee8a9e5aabeb193ec41d1" +checksum = "76f218a38c84bcb33c25ec7059b07847d465ce0e0a76b995e134a45adcb6af76" dependencies = [ "unicode-ident", ] [[package]] name = "wasm-bindgen-test" -version = "0.3.54" +version = "0.3.55" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e381134e148c1062f965a42ed1f5ee933eef2927c3f70d1812158f711d39865" +checksum = "bfc379bfb624eb59050b509c13e77b4eb53150c350db69628141abce842f2373" dependencies = [ "js-sys", "minicov", @@ -7049,13 +7039,13 @@ dependencies = [ [[package]] name = "wasm-bindgen-test-macro" -version = "0.3.54" +version = "0.3.55" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b673bca3298fe582aeef8352330ecbad91849f85090805582400850f8270a2e8" +checksum = "085b2df989e1e6f9620c1311df6c996e83fe16f57792b272ce1e024ac16a90f1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -7073,9 +7063,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.81" +version = "0.3.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9367c417a924a74cae129e6a2ae3b47fabb1f8995595ab474029da749a8be120" +checksum = "3a1f95c0d03a47f4ae1f7a64643a6bb97465d9b740f0fa8f90ea33915c99a9a1" dependencies = [ "js-sys", "wasm-bindgen", @@ -7091,6 +7081,15 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "webpki-roots" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2878ef029c47c6e8cf779119f20fcf52bde7ad42a731b2a304bc221df17571e" +dependencies = [ + "rustls-pki-types", +] + [[package]] name = "whoami" version = "1.6.1" @@ -7124,7 +7123,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.61.0", + "windows-sys 0.61.2", ] [[package]] @@ -7170,15 +7169,15 @@ dependencies = [ [[package]] name = "windows-core" -version = "0.62.0" +version = "0.62.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57fe7168f7de578d2d8a05b07fd61870d2e73b4020e9f49aa00da8471723497c" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" dependencies = [ "windows-implement", "windows-interface", - "windows-link 0.2.0", - "windows-result 0.4.0", - "windows-strings 0.5.0", + "windows-link 0.2.1", + "windows-result 0.4.1", + "windows-strings 0.5.1", ] [[package]] @@ -7194,24 +7193,24 @@ dependencies = [ [[package]] name = "windows-implement" -version = "0.60.0" +version = "0.60.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a47fddd13af08290e67f4acabf4b459f647552718f683a7b415d290ac744a836" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] name = "windows-interface" -version = "0.59.1" +version = "0.59.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd9211b69f8dcdfa817bfd14bf1c97c9188afa36f4750130fcdf3f400eca9fa8" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", "quote", 
- "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -7222,9 +7221,9 @@ checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" [[package]] name = "windows-link" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45e46c0661abb7180e7b9c281db115305d49ca1709ab8242adf09666d2173c65" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" [[package]] name = "windows-numerics" @@ -7247,11 +7246,11 @@ dependencies = [ [[package]] name = "windows-result" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7084dcc306f89883455a206237404d3eaf961e5bd7e0f312f7c91f57eb44167f" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" dependencies = [ - "windows-link 0.2.0", + "windows-link 0.2.1", ] [[package]] @@ -7265,11 +7264,11 @@ dependencies = [ [[package]] name = "windows-strings" -version = "0.5.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7218c655a553b0bed4426cf54b20d7ba363ef543b52d515b3e48d7fd55318dda" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" dependencies = [ - "windows-link 0.2.0", + "windows-link 0.2.1", ] [[package]] @@ -7296,16 +7295,16 @@ version = "0.60.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" dependencies = [ - "windows-targets 0.53.3", + "windows-targets 0.53.5", ] [[package]] name = "windows-sys" -version = "0.61.0" +version = "0.61.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e201184e40b2ede64bc2ea34968b28e33622acdbbf37104f0e4a33f7abe657aa" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" dependencies = [ - "windows-link 0.2.0", + "windows-link 0.2.1", ] [[package]] @@ -7326,19 +7325,19 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.53.3" +version = "0.53.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5fe6031c4041849d7c496a8ded650796e7b6ecc19df1a431c1a363342e5dc91" +checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" dependencies = [ - "windows-link 0.1.3", - "windows_aarch64_gnullvm 0.53.0", - "windows_aarch64_msvc 0.53.0", - "windows_i686_gnu 0.53.0", - "windows_i686_gnullvm 0.53.0", - "windows_i686_msvc 0.53.0", - "windows_x86_64_gnu 0.53.0", - "windows_x86_64_gnullvm 0.53.0", - "windows_x86_64_msvc 0.53.0", + "windows-link 0.2.1", + "windows_aarch64_gnullvm 0.53.1", + "windows_aarch64_msvc 0.53.1", + "windows_i686_gnu 0.53.1", + "windows_i686_gnullvm 0.53.1", + "windows_i686_msvc 0.53.1", + "windows_x86_64_gnu 0.53.1", + "windows_x86_64_gnullvm 0.53.1", + "windows_x86_64_msvc 0.53.1", ] [[package]] @@ -7358,9 +7357,9 @@ checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" [[package]] name = "windows_aarch64_gnullvm" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" +checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" [[package]] name = "windows_aarch64_msvc" @@ -7370,9 +7369,9 @@ checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" [[package]] name = "windows_aarch64_msvc" -version = "0.53.0" +version = "0.53.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" +checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" [[package]] name = "windows_i686_gnu" @@ -7382,9 +7381,9 @@ checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" [[package]] name = "windows_i686_gnu" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" +checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" [[package]] name = "windows_i686_gnullvm" @@ -7394,9 +7393,9 @@ checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" [[package]] name = "windows_i686_gnullvm" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" +checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" [[package]] name = "windows_i686_msvc" @@ -7406,9 +7405,9 @@ checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" [[package]] name = "windows_i686_msvc" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" +checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" [[package]] name = "windows_x86_64_gnu" @@ -7418,9 +7417,9 @@ checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" [[package]] name = "windows_x86_64_gnu" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" +checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" [[package]] name = "windows_x86_64_gnullvm" @@ -7430,9 +7429,9 @@ checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" [[package]] name = "windows_x86_64_gnullvm" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" +checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" [[package]] name = "windows_x86_64_msvc" @@ -7442,9 +7441,9 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "windows_x86_64_msvc" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" +checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" [[package]] name = "winnow" @@ -7463,9 +7462,9 @@ checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" [[package]] name = "writeable" -version = "0.6.1" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb" +checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" [[package]] name = "wyz" @@ -7509,11 +7508,10 @@ checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049" [[package]] name = "yoke" -version = "0.8.0" +version = "0.8.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f41bb01b8226ef4bfd589436a297c53d118f65921786300e427be8d487695cc" +checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954" dependencies = [ - "serde", "stable_deref_trait", "yoke-derive", "zerofrom", @@ -7521,13 +7519,13 @@ dependencies = [ [[package]] name = "yoke-derive" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" +checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", "synstructure", ] @@ -7548,7 +7546,7 @@ checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] @@ -7568,21 +7566,21 @@ checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", "synstructure", ] [[package]] name = "zeroize" -version = "1.8.1" +version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" +checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" [[package]] name = "zerotrie" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36f0bbd478583f79edad978b407914f61b2972f5af6fa089686016be8f9af595" +checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851" dependencies = [ "displaydoc", "yoke", @@ -7591,9 +7589,9 @@ dependencies = [ [[package]] name = "zerovec" -version = "0.11.4" +version = "0.11.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7aa2bd55086f1ab526693ecbe444205da57e25f4489879da80635a46d90e73b" +checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002" dependencies = [ "yoke", "zerofrom", @@ -7602,13 +7600,13 @@ dependencies = [ [[package]] name = "zerovec-derive" -version = "0.11.1" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" +checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.110", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 3e0861c07ab0..36198430e40b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -77,9 +77,9 @@ license = "Apache-2.0" readme = "README.md" repository = "https://github.com/apache/datafusion" # Define Minimum Supported Rust Version (MSRV) -rust-version = "1.87.0" +rust-version = "1.88.0" # Define DataFusion version -version = "50.3.0" +version = "51.0.0" [workspace.dependencies] # We turn off default-features for some dependencies here so the workspaces which inherit them can @@ -91,63 +91,63 @@ ahash = { version = "0.8", default-features = false, features = [ "runtime-rng", ] } apache-avro = { version = "0.20", default-features = false } -arrow = { version = "56.2.0", features = [ +arrow = { version = "57.0.0", features = [ "prettyprint", "chrono-tz", ] } -arrow-buffer = { version = "56.2.0", default-features = false } -arrow-flight = { version = "56.2.0", features = [ +arrow-buffer = { version = "57.0.0", default-features = false } +arrow-flight = { version = "57.0.0", features = [ 
"flight-sql-experimental", ] } -arrow-ipc = { version = "56.2.0", default-features = false, features = [ +arrow-ipc = { version = "57.0.0", default-features = false, features = [ "lz4", ] } -arrow-ord = { version = "56.2.0", default-features = false } -arrow-schema = { version = "56.2.0", default-features = false } +arrow-ord = { version = "57.0.0", default-features = false } +arrow-schema = { version = "57.0.0", default-features = false } async-trait = "0.1.89" bigdecimal = "0.4.8" bytes = "1.10" chrono = { version = "0.4.42", default-features = false } -criterion = "0.5.1" -ctor = "0.4.3" +criterion = "0.7" +ctor = "0.6.1" dashmap = "6.0.1" -datafusion = { path = "datafusion/core", version = "50.3.0", default-features = false } -datafusion-catalog = { path = "datafusion/catalog", version = "50.3.0" } -datafusion-catalog-listing = { path = "datafusion/catalog-listing", version = "50.3.0" } -datafusion-common = { path = "datafusion/common", version = "50.3.0", default-features = false } -datafusion-common-runtime = { path = "datafusion/common-runtime", version = "50.3.0" } -datafusion-datasource = { path = "datafusion/datasource", version = "50.3.0", default-features = false } -datafusion-datasource-arrow = { path = "datafusion/datasource-arrow", version = "50.3.0", default-features = false } -datafusion-datasource-avro = { path = "datafusion/datasource-avro", version = "50.3.0", default-features = false } -datafusion-datasource-csv = { path = "datafusion/datasource-csv", version = "50.3.0", default-features = false } -datafusion-datasource-json = { path = "datafusion/datasource-json", version = "50.3.0", default-features = false } -datafusion-datasource-parquet = { path = "datafusion/datasource-parquet", version = "50.3.0", default-features = false } -datafusion-doc = { path = "datafusion/doc", version = "50.3.0" } -datafusion-execution = { path = "datafusion/execution", version = "50.3.0", default-features = false } -datafusion-expr = { path = "datafusion/expr", version = "50.3.0", default-features = false } -datafusion-expr-common = { path = "datafusion/expr-common", version = "50.3.0" } -datafusion-ffi = { path = "datafusion/ffi", version = "50.3.0" } -datafusion-functions = { path = "datafusion/functions", version = "50.3.0" } -datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "50.3.0" } -datafusion-functions-aggregate-common = { path = "datafusion/functions-aggregate-common", version = "50.3.0" } -datafusion-functions-nested = { path = "datafusion/functions-nested", version = "50.3.0", default-features = false } -datafusion-functions-table = { path = "datafusion/functions-table", version = "50.3.0" } -datafusion-functions-window = { path = "datafusion/functions-window", version = "50.3.0" } -datafusion-functions-window-common = { path = "datafusion/functions-window-common", version = "50.3.0" } -datafusion-macros = { path = "datafusion/macros", version = "50.3.0" } -datafusion-optimizer = { path = "datafusion/optimizer", version = "50.3.0", default-features = false } -datafusion-physical-expr = { path = "datafusion/physical-expr", version = "50.3.0", default-features = false } -datafusion-physical-expr-adapter = { path = "datafusion/physical-expr-adapter", version = "50.3.0", default-features = false } -datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "50.3.0", default-features = false } -datafusion-physical-optimizer = { path = "datafusion/physical-optimizer", version = "50.3.0" } -datafusion-physical-plan = 
{ path = "datafusion/physical-plan", version = "50.3.0" } -datafusion-proto = { path = "datafusion/proto", version = "50.3.0" } -datafusion-proto-common = { path = "datafusion/proto-common", version = "50.3.0" } -datafusion-pruning = { path = "datafusion/pruning", version = "50.3.0" } -datafusion-session = { path = "datafusion/session", version = "50.3.0" } -datafusion-spark = { path = "datafusion/spark", version = "50.3.0" } -datafusion-sql = { path = "datafusion/sql", version = "50.3.0" } -datafusion-substrait = { path = "datafusion/substrait", version = "50.3.0" } +datafusion = { path = "datafusion/core", version = "51.0.0", default-features = false } +datafusion-catalog = { path = "datafusion/catalog", version = "51.0.0" } +datafusion-catalog-listing = { path = "datafusion/catalog-listing", version = "51.0.0" } +datafusion-common = { path = "datafusion/common", version = "51.0.0", default-features = false } +datafusion-common-runtime = { path = "datafusion/common-runtime", version = "51.0.0" } +datafusion-datasource = { path = "datafusion/datasource", version = "51.0.0", default-features = false } +datafusion-datasource-arrow = { path = "datafusion/datasource-arrow", version = "51.0.0", default-features = false } +datafusion-datasource-avro = { path = "datafusion/datasource-avro", version = "51.0.0", default-features = false } +datafusion-datasource-csv = { path = "datafusion/datasource-csv", version = "51.0.0", default-features = false } +datafusion-datasource-json = { path = "datafusion/datasource-json", version = "51.0.0", default-features = false } +datafusion-datasource-parquet = { path = "datafusion/datasource-parquet", version = "51.0.0", default-features = false } +datafusion-doc = { path = "datafusion/doc", version = "51.0.0" } +datafusion-execution = { path = "datafusion/execution", version = "51.0.0", default-features = false } +datafusion-expr = { path = "datafusion/expr", version = "51.0.0", default-features = false } +datafusion-expr-common = { path = "datafusion/expr-common", version = "51.0.0" } +datafusion-ffi = { path = "datafusion/ffi", version = "51.0.0" } +datafusion-functions = { path = "datafusion/functions", version = "51.0.0" } +datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "51.0.0" } +datafusion-functions-aggregate-common = { path = "datafusion/functions-aggregate-common", version = "51.0.0" } +datafusion-functions-nested = { path = "datafusion/functions-nested", version = "51.0.0", default-features = false } +datafusion-functions-table = { path = "datafusion/functions-table", version = "51.0.0" } +datafusion-functions-window = { path = "datafusion/functions-window", version = "51.0.0" } +datafusion-functions-window-common = { path = "datafusion/functions-window-common", version = "51.0.0" } +datafusion-macros = { path = "datafusion/macros", version = "51.0.0" } +datafusion-optimizer = { path = "datafusion/optimizer", version = "51.0.0", default-features = false } +datafusion-physical-expr = { path = "datafusion/physical-expr", version = "51.0.0", default-features = false } +datafusion-physical-expr-adapter = { path = "datafusion/physical-expr-adapter", version = "51.0.0", default-features = false } +datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "51.0.0", default-features = false } +datafusion-physical-optimizer = { path = "datafusion/physical-optimizer", version = "51.0.0" } +datafusion-physical-plan = { path = "datafusion/physical-plan", version = "51.0.0" } +datafusion-proto = 
{ path = "datafusion/proto", version = "51.0.0" } +datafusion-proto-common = { path = "datafusion/proto-common", version = "51.0.0" } +datafusion-pruning = { path = "datafusion/pruning", version = "51.0.0" } +datafusion-session = { path = "datafusion/session", version = "51.0.0" } +datafusion-spark = { path = "datafusion/spark", version = "51.0.0" } +datafusion-sql = { path = "datafusion/sql", version = "51.0.0" } +datafusion-substrait = { path = "datafusion/substrait", version = "51.0.0" } doc-comment = "0.3" env_logger = "0.11" @@ -156,29 +156,30 @@ half = { version = "2.7.0", default-features = false } hashbrown = { version = "0.14.5", features = ["raw"] } hex = { version = "0.4.3" } indexmap = "2.12.0" +insta = { version = "1.43.2", features = ["glob", "filters"] } itertools = "0.14" log = "^0.4" +num-traits = { version = "0.2" } object_store = { version = "0.12.4", default-features = false } parking_lot = "0.12" -parquet = { version = "56.2.0", default-features = false, features = [ +parquet = { version = "57.0.0", default-features = false, features = [ "arrow", "async", "object_store", ] } -pbjson = { version = "0.7.0" } -pbjson-types = "0.7" +pbjson = { version = "0.8.0" } +pbjson-types = "0.8" # Should match arrow-flight's version of prost. -insta = { version = "1.43.2", features = ["glob", "filters"] } -prost = "0.13.1" +prost = "0.14.1" rand = "0.9" recursive = "0.1.1" -regex = "1.11" -rstest = "0.25.0" +regex = "1.12" +rstest = "0.26.1" serde_json = "1" sqlparser = { version = "0.59.0", default-features = false, features = ["std", "visitor"] } tempfile = "3" -testcontainers = { version = "0.24", features = ["default"] } -testcontainers-modules = { version = "0.12" } +testcontainers = { version = "0.25.2", features = ["default"] } +testcontainers-modules = { version = "0.13" } tokio = { version = "1.48", features = ["macros", "rt", "sync"] } url = "2.5.7" diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index b3fd520814db..870c826f5581 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -26,6 +26,9 @@ repository = { workspace = true } license = { workspace = true } rust-version = { workspace = true } +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/ci/scripts/rust_clippy.sh b/ci/scripts/rust_clippy.sh index 1557bd56eab4..6a00ad810956 100755 --- a/ci/scripts/rust_clippy.sh +++ b/ci/scripts/rust_clippy.sh @@ -18,4 +18,4 @@ # under the License. 
set -ex -cargo clippy --all-targets --workspace --features avro,pyarrow,integration-tests,extended_tests -- -D warnings +cargo clippy --all-targets --workspace --features avro,pyarrow,integration-tests,extended_tests -- -D warnings \ No newline at end of file diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 53744e6c609b..f3069b492352 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -40,7 +40,7 @@ async-trait = { workspace = true } aws-config = "1.8.7" aws-credential-types = "1.2.7" chrono = { workspace = true } -clap = { version = "4.5.47", features = ["cargo", "derive"] } +clap = { version = "4.5.50", features = ["cargo", "derive"] } datafusion = { workspace = true, features = [ "avro", "compression", diff --git a/datafusion-cli/src/functions.rs b/datafusion-cli/src/functions.rs index 3ec446c51583..d23b12469e38 100644 --- a/datafusion-cli/src/functions.rs +++ b/datafusion-cli/src/functions.rs @@ -419,7 +419,9 @@ impl TableFunctionImpl for ParquetMetadataFunc { stats_max_value_arr.push(None); }; compression_arr.push(format!("{:?}", column.compression())); - encodings_arr.push(format!("{:?}", column.encodings())); + // need to collect into Vec to format + let encodings: Vec<_> = column.encodings().collect(); + encodings_arr.push(format!("{:?}", encodings)); index_page_offset_arr.push(column.index_page_offset()); dictionary_page_offset_arr.push(column.dictionary_page_offset()); data_page_offset_arr.push(column.data_page_offset()); diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs index bdb2fdf5198e..09fa8ef15af8 100644 --- a/datafusion-cli/src/main.rs +++ b/datafusion-cli/src/main.rs @@ -497,7 +497,7 @@ mod tests { +-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ | filename | row_group_id | row_group_num_rows | row_group_num_columns | row_group_bytes | column_id | file_offset | num_values | path_in_schema | type | stats_min | stats_max | stats_null_count | stats_distinct_count | stats_min_value | stats_max_value | compression | encodings | index_page_offset | dictionary_page_offset | data_page_offset | total_compressed_size | total_uncompressed_size | +-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ - | ../datafusion/core/tests/data/fixed_size_list_array.parquet | 0 | 2 | 1 | 123 | 0 | 125 | 4 | "f0.list.item" | INT64 | 1 | 4 | 0 | | 1 | 4 | SNAPPY | [RLE_DICTIONARY, PLAIN, RLE] | | 4 | 46 | 121 | 123 | + | ../datafusion/core/tests/data/fixed_size_list_array.parquet | 0 | 2 | 1 | 123 | 0 | 125 | 4 | "f0.list.item" | INT64 | 1 | 4 | 0 | | 1 | 4 | SNAPPY | [PLAIN, RLE, RLE_DICTIONARY] | | 4 | 46 | 121 | 123 | 
+-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ "#); @@ -510,7 +510,7 @@ mod tests { +-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ | filename | row_group_id | row_group_num_rows | row_group_num_columns | row_group_bytes | column_id | file_offset | num_values | path_in_schema | type | stats_min | stats_max | stats_null_count | stats_distinct_count | stats_min_value | stats_max_value | compression | encodings | index_page_offset | dictionary_page_offset | data_page_offset | total_compressed_size | total_uncompressed_size | +-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ - | ../datafusion/core/tests/data/fixed_size_list_array.parquet | 0 | 2 | 1 | 123 | 0 | 125 | 4 | "f0.list.item" | INT64 | 1 | 4 | 0 | | 1 | 4 | SNAPPY | [RLE_DICTIONARY, PLAIN, RLE] | | 4 | 46 | 121 | 123 | + | ../datafusion/core/tests/data/fixed_size_list_array.parquet | 0 | 2 | 1 | 123 | 0 | 125 | 4 | "f0.list.item" | INT64 | 1 | 4 | 0 | | 1 | 4 | SNAPPY | [PLAIN, RLE, RLE_DICTIONARY] | | 4 | 46 | 121 | 123 | +-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ "#); @@ -532,7 +532,7 @@ mod tests { +-----------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+------------+-----------+-----------+------------------+----------------------+-----------------+-----------------+--------------------+--------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ | filename | row_group_id | row_group_num_rows | row_group_num_columns | row_group_bytes | column_id | file_offset | num_values | path_in_schema | type | stats_min | stats_max | stats_null_count | stats_distinct_count | stats_min_value | stats_max_value | compression | encodings | index_page_offset | 
dictionary_page_offset | data_page_offset | total_compressed_size | total_uncompressed_size | +-----------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+------------+-----------+-----------+------------------+----------------------+-----------------+-----------------+--------------------+--------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ - | ../parquet-testing/data/data_index_bloom_encoding_stats.parquet | 0 | 14 | 1 | 163 | 0 | 4 | 14 | "String" | BYTE_ARRAY | Hello | today | 0 | | Hello | today | GZIP(GzipLevel(6)) | [BIT_PACKED, RLE, PLAIN] | | | 4 | 152 | 163 | + | ../parquet-testing/data/data_index_bloom_encoding_stats.parquet | 0 | 14 | 1 | 163 | 0 | 4 | 14 | "String" | BYTE_ARRAY | Hello | today | 0 | | Hello | today | GZIP(GzipLevel(6)) | [PLAIN, RLE, BIT_PACKED] | | | 4 | 152 | 163 | +-----------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+------------+-----------+-----------+------------------+----------------------+-----------------+-----------------+--------------------+--------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ "#); @@ -592,9 +592,9 @@ mod tests { +-----------------------------------+-----------------+---------------------+------+------------------+ | filename | file_size_bytes | metadata_size_bytes | hits | extra | +-----------------------------------+-----------------+---------------------+------+------------------+ - | alltypes_plain.parquet | 1851 | 10181 | 2 | page_index=false | - | alltypes_tiny_pages.parquet | 454233 | 881418 | 2 | page_index=true | - | lz4_raw_compressed_larger.parquet | 380836 | 2939 | 2 | page_index=false | + | alltypes_plain.parquet | 1851 | 6957 | 2 | page_index=false | + | alltypes_tiny_pages.parquet | 454233 | 267014 | 2 | page_index=true | + | lz4_raw_compressed_larger.parquet | 380836 | 996 | 2 | page_index=false | +-----------------------------------+-----------------+---------------------+------+------------------+ "); @@ -623,9 +623,9 @@ mod tests { +-----------------------------------+-----------------+---------------------+------+------------------+ | filename | file_size_bytes | metadata_size_bytes | hits | extra | +-----------------------------------+-----------------+---------------------+------+------------------+ - | alltypes_plain.parquet | 1851 | 10181 | 5 | page_index=false | - | alltypes_tiny_pages.parquet | 454233 | 881418 | 2 | page_index=true | - | lz4_raw_compressed_larger.parquet | 380836 | 2939 | 3 | page_index=false | + | alltypes_plain.parquet | 1851 | 6957 | 5 | page_index=false | + | alltypes_tiny_pages.parquet | 454233 | 267014 | 2 | page_index=true | + | lz4_raw_compressed_larger.parquet | 380836 | 996 | 3 | page_index=false | +-----------------------------------+-----------------+---------------------+------+------------------+ "); diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 68bb5376a1ac..61711f8472eb 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -29,21 +29,12 @@ license = { workspace = true } authors = { workspace = true } rust-version = { workspace = true } +# Note: add 
additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true -[[example]] -name = "flight_sql_server" -path = "examples/flight/flight_sql_server.rs" - -[[example]] -name = "flight_server" -path = "examples/flight/flight_server.rs" - -[[example]] -name = "flight_client" -path = "examples/flight/flight_client.rs" - [[example]] name = "dataframe_to_s3" path = "examples/external_dependency/dataframe-to-s3.rs" @@ -52,10 +43,6 @@ path = "examples/external_dependency/dataframe-to-s3.rs" name = "query_aws_s3" path = "examples/external_dependency/query-aws-s3.rs" -[[example]] -name = "custom_file_casts" -path = "examples/custom_file_casts.rs" - [dev-dependencies] arrow = { workspace = true } # arrow_schema is required for record_batch! macro :sad: @@ -81,7 +68,7 @@ serde_json = { workspace = true } tempfile = { workspace = true } test-utils = { path = "../test-utils" } tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] } -tonic = "0.13.1" +tonic = "0.14" tracing = { version = "0.1" } tracing-subscriber = { version = "0.3" } url = { workspace = true } diff --git a/datafusion-examples/README.md b/datafusion-examples/README.md index f1bcbcce8200..62e51a790014 100644 --- a/datafusion-examples/README.md +++ b/datafusion-examples/README.md @@ -46,27 +46,28 @@ cargo run --example dataframe ## Single Process -- [`advanced_udaf.rs`](examples/advanced_udaf.rs): Define and invoke a more complicated User Defined Aggregate Function (UDAF) -- [`advanced_udf.rs`](examples/advanced_udf.rs): Define and invoke a more complicated User Defined Scalar Function (UDF) -- [`advanced_udwf.rs`](examples/advanced_udwf.rs): Define and invoke a more complicated User Defined Window Function (UDWF) +- [`examples/udf/advanced_udaf.rs`](examples/udf/advanced_udaf.rs): Define and invoke a more complicated User Defined Aggregate Function (UDAF) +- [`examples/udf/advanced_udf.rs`](examples/udf/advanced_udf.rs): Define and invoke a more complicated User Defined Scalar Function (UDF) +- [`examples/udf/advanced_udwf.rs`](examples/udf/advanced_udwf.rs): Define and invoke a more complicated User Defined Window Function (UDWF) - [`advanced_parquet_index.rs`](examples/advanced_parquet_index.rs): Creates a detailed secondary index that covers the contents of several parquet files -- [`async_udf.rs`](examples/async_udf.rs): Define and invoke an asynchronous User Defined Scalar Function (UDF) +- [`examples/udf/async_udf.rs`](examples/udf/async_udf.rs): Define and invoke an asynchronous User Defined Scalar Function (UDF) - [`analyzer_rule.rs`](examples/analyzer_rule.rs): Use a custom AnalyzerRule to change a query's semantics (row level access control) - [`catalog.rs`](examples/catalog.rs): Register the table into a custom catalog - [`composed_extension_codec`](examples/composed_extension_codec.rs): Example of using multiple extension codecs for serialization / deserialization -- [`csv_sql_streaming.rs`](examples/csv_sql_streaming.rs): Build and run a streaming query plan from a SQL statement against a local CSV file -- [`csv_json_opener.rs`](examples/csv_json_opener.rs): Use low level `FileOpener` APIs to read CSV/JSON into Arrow `RecordBatch`es -- [`custom_datasource.rs`](examples/custom_datasource.rs): Run queries against a custom datasource (TableProvider) -- [`custom_file_casts.rs`](examples/custom_file_casts.rs): Implement custom casting rules to adapt file schemas -- 
[`custom_file_format.rs`](examples/custom_file_format.rs): Write data to a custom file format +- [`examples/custom_data_source/csv_sql_streaming.rs`](examples/custom_data_source/csv_sql_streaming.rs): Build and run a streaming query plan from a SQL statement against a local CSV file +- [`examples/custom_data_source/csv_json_opener.rs`](examples/custom_data_source/csv_json_opener.rs): Use low level `FileOpener` APIs to read CSV/JSON into Arrow `RecordBatch`es +- [`examples/custom_data_source/custom_datasource.rs`](examples/custom_data_source/custom_datasource.rs): Run queries against a custom datasource (TableProvider) +- [`examples/custom_data_source/custom_file_casts.rs`](examples/custom_data_source/custom_file_casts.rs): Implement custom casting rules to adapt file schemas +- [`examples/custom_data_source/custom_file_format.rs`](examples/custom_data_source/custom_file_format.rs): Write data to a custom file format - [`dataframe-to-s3.rs`](examples/external_dependency/dataframe-to-s3.rs): Run a query using a DataFrame against a parquet file from s3 and writing back to s3 - [`dataframe.rs`](examples/dataframe.rs): Run a query using a DataFrame API against parquet files, csv files, and in-memory data, including multiple subqueries. Also demonstrates the various methods to write out a DataFrame to a table, parquet file, csv file, and json file. +- [`examples/builtin_functions/date_time.rs`](examples/builtin_functions/date_time.rs): Examples of date-time related functions and queries - [`default_column_values.rs`](examples/default_column_values.rs): Implement custom default value handling for missing columns using field metadata and PhysicalExprAdapter - [`deserialize_to_struct.rs`](examples/deserialize_to_struct.rs): Convert query results (Arrow ArrayRefs) into Rust structs - [`expr_api.rs`](examples/expr_api.rs): Create, execute, simplify, analyze and coerce `Expr`s -- [`file_stream_provider.rs`](examples/file_stream_provider.rs): Run a query on `FileStreamProvider` which implements `StreamProvider` for reading and writing to arbitrary stream sources / sinks. -- [`flight_sql_server.rs`](examples/flight/flight_sql_server.rs): Run DataFusion as a standalone process and execute SQL queries from JDBC clients -- [`function_factory.rs`](examples/function_factory.rs): Register `CREATE FUNCTION` handler to implement SQL macros +- [`examples/custom_data_source/file_stream_provider.rs`](examples/custom_data_source/file_stream_provider.rs): Run a query on `FileStreamProvider` which implements `StreamProvider` for reading and writing to arbitrary stream sources / sinks. +- [`flight/sql_server.rs`](examples/flight/sql_server.rs): Run DataFusion as a standalone process and execute SQL queries from Flight and FlightSQL (e.g.
JDBC) clients +- [`examples/builtin_functions/function_factory.rs`](examples/builtin_functions/function_factory.rs): Register `CREATE FUNCTION` handler to implement SQL macros - [`memory_pool_tracking.rs`](examples/memory_pool_tracking.rs): Demonstrates TrackConsumersPool for memory tracking and debugging with enhanced error messages - [`memory_pool_execution_plan.rs`](examples/memory_pool_execution_plan.rs): Shows how to implement memory-aware ExecutionPlan with memory reservation and spilling - [`optimizer_rule.rs`](examples/optimizer_rule.rs): Use a custom OptimizerRule to replace certain predicates @@ -81,17 +82,17 @@ cargo run --example dataframe - [`pruning.rs`](examples/pruning.rs): Use pruning to rule out files based on statistics - [`query-aws-s3.rs`](examples/external_dependency/query-aws-s3.rs): Configure `object_store` and run a query against files stored in AWS S3 - [`query-http-csv.rs`](examples/query-http-csv.rs): Configure `object_store` and run a query against files vi HTTP -- [`regexp.rs`](examples/regexp.rs): Examples of using regular expression functions +- [`examples/builtin_functions/regexp.rs`](examples/builtin_functions/regexp.rs): Examples of using regular expression functions - [`remote_catalog.rs`](examples/regexp.rs): Examples of interfacing with a remote catalog (e.g. over a network) -- [`simple_udaf.rs`](examples/simple_udaf.rs): Define and invoke a User Defined Aggregate Function (UDAF) -- [`simple_udf.rs`](examples/simple_udf.rs): Define and invoke a User Defined Scalar Function (UDF) -- [`simple_udfw.rs`](examples/simple_udwf.rs): Define and invoke a User Defined Window Function (UDWF) +- [`examples/udf/simple_udaf.rs`](examples/udf/simple_udaf.rs): Define and invoke a User Defined Aggregate Function (UDAF) +- [`examples/udf/simple_udf.rs`](examples/udf/simple_udf.rs): Define and invoke a User Defined Scalar Function (UDF) +- [`examples/udf/simple_udtf.rs`](examples/udf/simple_udtf.rs): Define and invoke a User Defined Table Function (UDTF) +- [`examples/udf/simple_udfw.rs`](examples/udf/simple_udwf.rs): Define and invoke a User Defined Window Function (UDWF) - [`sql_analysis.rs`](examples/sql_analysis.rs): Analyse SQL queries with DataFusion structures - [`sql_frontend.rs`](examples/sql_frontend.rs): Create LogicalPlans (only) from sql strings - [`sql_dialect.rs`](examples/sql_dialect.rs): Example of implementing a custom SQL dialect on top of `DFParser` - [`sql_query.rs`](examples/memtable.rs): Query data using SQL (in memory `RecordBatches`, local Parquet files) -- [`date_time_function.rs`](examples/date_time_function.rs): Examples of date-time related functions and queries. ## Distributed -- [`flight_client.rs`](examples/flight/flight_client.rs) and [`flight_server.rs`](examples/flight/flight_server.rs): Run DataFusion as a standalone process and execute SQL queries from a client using the Flight protocol. +- [`examples/flight/client.rs`](examples/flight/client.rs) and [`examples/flight/server.rs`](examples/flight/server.rs): Run DataFusion as a standalone process and execute SQL queries from a client using the Arrow Flight protocol. diff --git a/datafusion-examples/examples/advanced_parquet_index.rs b/datafusion-examples/examples/advanced_parquet_index.rs index 55400e219283..67bfc5b1bcf5 100644 --- a/datafusion-examples/examples/advanced_parquet_index.rs +++ b/datafusion-examples/examples/advanced_parquet_index.rs @@ -121,7 +121,6 @@ use url::Url; /// │ ╚═══════════════════╝ │ 1. 
With cached ParquetMetadata, so /// └───────────────────────┘ the ParquetSource does not re-read / /// Parquet File decode the thrift footer -/// /// ``` /// /// Within a Row Group, Column Chunks store data in DataPages. This example also @@ -492,19 +491,18 @@ impl TableProvider for IndexTableProvider { .with_file(indexed_file); let file_source = Arc::new( - ParquetSource::default() + ParquetSource::new(schema.clone()) // provide the predicate so the DataSourceExec can try and prune // row groups internally .with_predicate(predicate) // provide the factory to create parquet reader without re-reading metadata .with_parquet_file_reader_factory(Arc::new(reader_factory)), ); - let file_scan_config = - FileScanConfigBuilder::new(object_store_url, schema, file_source) - .with_limit(limit) - .with_projection(projection.cloned()) - .with_file(partitioned_file) - .build(); + let file_scan_config = FileScanConfigBuilder::new(object_store_url, file_source) + .with_limit(limit) + .with_projection_indices(projection.cloned()) + .with_file(partitioned_file) + .build(); // Finally, put it all together into a DataSourceExec Ok(DataSourceExec::from_data_source(file_scan_config)) diff --git a/datafusion-examples/examples/date_time_functions.rs b/datafusion-examples/examples/builtin_functions/date_time.rs similarity index 97% rename from datafusion-examples/examples/date_time_functions.rs rename to datafusion-examples/examples/builtin_functions/date_time.rs index 2628319ae31f..178cba979cb9 100644 --- a/datafusion-examples/examples/date_time_functions.rs +++ b/datafusion-examples/examples/builtin_functions/date_time.rs @@ -26,8 +26,20 @@ use datafusion::common::assert_contains; use datafusion::error::Result; use datafusion::prelude::*; -#[tokio::main] -async fn main() -> Result<()> { +/// Example: Working with Date and Time Functions +/// +/// This example demonstrates how to work with various date and time +/// functions in DataFusion using both the DataFrame API and SQL queries. +/// +/// It includes: +/// - `make_date`: building `DATE` values from year, month, and day columns +/// - `to_date`: converting string expressions into `DATE` values +/// - `to_timestamp`: parsing strings or numeric values into `TIMESTAMP`s +/// - `to_char`: formatting dates, timestamps, and durations as strings +/// +/// Together, these examples show how to create, convert, and format temporal +/// data using DataFusion’s built-in functions. +pub async fn date_time() -> Result<()> { query_make_date().await?; query_to_date().await?; query_to_timestamp().await?; diff --git a/datafusion-examples/examples/function_factory.rs b/datafusion-examples/examples/builtin_functions/function_factory.rs similarity index 99% rename from datafusion-examples/examples/function_factory.rs rename to datafusion-examples/examples/builtin_functions/function_factory.rs index d4312ae59409..5d41e7a26071 100644 --- a/datafusion-examples/examples/function_factory.rs +++ b/datafusion-examples/examples/builtin_functions/function_factory.rs @@ -42,8 +42,7 @@ use std::sync::Arc; /// /// This example is rather simple and does not cover all cases required for a /// real implementation. 
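A note on the recurring API migration in the hunk above (it repeats in `default_column_values.rs`, `parquet_embedded_index.rs`, and `parquet_index.rs` later in this diff): the file schema now moves into the `FileSource` itself via `ParquetSource::new(schema)`, `FileScanConfigBuilder::new` drops its schema argument, and `with_projection` becomes `with_projection_indices`. A minimal sketch of that pattern, using only builder calls that appear in this diff; the import paths are assumptions rather than part of the patch:

```rust
// Sketch only: import paths are assumed; the builder calls mirror this diff.
use std::sync::Arc;

use datafusion::arrow::datatypes::SchemaRef;
use datafusion::datasource::listing::PartitionedFile;
use datafusion::datasource::object_store::ObjectStoreUrl;
use datafusion::datasource::physical_plan::{
    FileScanConfig, FileScanConfigBuilder, ParquetSource,
};
use datafusion::error::Result;

fn build_parquet_scan(schema: SchemaRef, file: PartitionedFile) -> Result<FileScanConfig> {
    // The file schema now belongs to the source itself ...
    let source = Arc::new(ParquetSource::new(schema));
    // ... so the builder takes only the object store URL and the source,
    // and projections are supplied as column indices.
    let config = FileScanConfigBuilder::new(ObjectStoreUrl::parse("file://")?, source)
        .with_projection_indices(Some(vec![0, 1]))
        .with_file(file)
        .build();
    Ok(config)
}
```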
-#[tokio::main] -async fn main() -> Result<()> { +pub async fn function_factory() -> Result<()> { // First we must configure the SessionContext with our function factory let ctx = SessionContext::new() // register custom function factory diff --git a/datafusion-examples/examples/builtin_functions/main.rs b/datafusion-examples/examples/builtin_functions/main.rs new file mode 100644 index 000000000000..c307bc9532bf --- /dev/null +++ b/datafusion-examples/examples/builtin_functions/main.rs @@ -0,0 +1,99 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! # These are miscellaneous function-related examples +//! +//! These examples demonstrate miscellaneous function-related features. +//! +//! ## Usage +//! ```bash +//! cargo run --example builtin_functions -- [date_time|function_factory|regexp] +//! ``` +//! +//! Each subcommand runs a corresponding example: +//! - `date_time` — examples of date-time related functions and queries +//! - `function_factory` — register `CREATE FUNCTION` handler to implement SQL macros +//! - `regexp` — examples of using regular expression functions + +mod date_time; +mod function_factory; +mod regexp; + +use std::str::FromStr; + +use datafusion::error::{DataFusionError, Result}; + +enum ExampleKind { + DateTime, + FunctionFactory, + Regexp, +} + +impl AsRef for ExampleKind { + fn as_ref(&self) -> &str { + match self { + Self::DateTime => "date_time", + Self::FunctionFactory => "function_factory", + Self::Regexp => "regexp", + } + } +} + +impl FromStr for ExampleKind { + type Err = DataFusionError; + + fn from_str(s: &str) -> Result { + match s { + "date_time" => Ok(Self::DateTime), + "function_factory" => Ok(Self::FunctionFactory), + "regexp" => Ok(Self::Regexp), + _ => Err(DataFusionError::Execution(format!("Unknown example: {s}"))), + } + } +} + +impl ExampleKind { + const ALL: [Self; 3] = [Self::DateTime, Self::FunctionFactory, Self::Regexp]; + + const EXAMPLE_NAME: &str = "builtin_functions"; + + fn variants() -> Vec<&'static str> { + Self::ALL.iter().map(|x| x.as_ref()).collect() + } +} + +#[tokio::main] +async fn main() -> Result<()> { + let usage = format!( + "Usage: cargo run --example {} -- [{}]", + ExampleKind::EXAMPLE_NAME, + ExampleKind::variants().join("|") + ); + + let arg = std::env::args().nth(1).ok_or_else(|| { + eprintln!("{usage}"); + DataFusionError::Execution("Missing argument".to_string()) + })?; + + match arg.parse::()? 
{ + ExampleKind::DateTime => date_time::date_time().await?, + ExampleKind::FunctionFactory => function_factory::function_factory().await?, + ExampleKind::Regexp => regexp::regexp().await?, + } + + Ok(()) +} diff --git a/datafusion-examples/examples/regexp.rs b/datafusion-examples/examples/builtin_functions/regexp.rs similarity index 99% rename from datafusion-examples/examples/regexp.rs rename to datafusion-examples/examples/builtin_functions/regexp.rs index 12d115b9b502..13c078693028 100644 --- a/datafusion-examples/examples/regexp.rs +++ b/datafusion-examples/examples/builtin_functions/regexp.rs @@ -28,12 +28,11 @@ use datafusion::prelude::*; /// /// Supported flags can be found at /// https://docs.rs/regex/latest/regex/#grouping-and-flags -#[tokio::main] -async fn main() -> Result<()> { +pub async fn regexp() -> Result<()> { let ctx = SessionContext::new(); ctx.register_csv( "examples", - "../../datafusion/physical-expr/tests/data/regex.csv", + "datafusion/physical-expr/tests/data/regex.csv", CsvReadOptions::new(), ) .await?; diff --git a/datafusion-examples/examples/csv_json_opener.rs b/datafusion-examples/examples/custom_data_source/csv_json_opener.rs similarity index 89% rename from datafusion-examples/examples/csv_json_opener.rs rename to datafusion-examples/examples/custom_data_source/csv_json_opener.rs index 1a2c2cbff418..4205bbcdf86a 100644 --- a/datafusion-examples/examples/csv_json_opener.rs +++ b/datafusion-examples/examples/custom_data_source/csv_json_opener.rs @@ -18,6 +18,7 @@ use std::sync::Arc; use arrow::datatypes::{DataType, Field, Schema}; +use datafusion::common::config::CsvOptions; use datafusion::{ assert_batches_eq, datasource::{ @@ -39,8 +40,7 @@ use object_store::{local::LocalFileSystem, memory::InMemory, ObjectStore}; /// read data from (CSV/JSON) into Arrow RecordBatches. 
/// /// If you want to query data in CSV or JSON files, see the [`dataframe.rs`] and [`sql_query.rs`] examples -#[tokio::main] -async fn main() -> Result<()> { +pub async fn csv_json_opener() -> Result<()> { csv_opener().await?; json_opener().await?; Ok(()) @@ -55,19 +55,25 @@ async fn csv_opener() -> Result<()> { let path = std::path::Path::new(&path).canonicalize()?; + let options = CsvOptions { + has_header: Some(true), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + let scan_config = FileScanConfigBuilder::new( ObjectStoreUrl::local_filesystem(), - Arc::clone(&schema), - Arc::new(CsvSource::default()), + Arc::new(CsvSource::new(Arc::clone(&schema)).with_csv_options(options.clone())), ) - .with_projection(Some(vec![12, 0])) + .with_projection_indices(Some(vec![12, 0])) .with_limit(Some(5)) .with_file(PartitionedFile::new(path.display().to_string(), 10)) .build(); - let config = CsvSource::new(true, b',', b'"') + let config = CsvSource::new(Arc::clone(&schema)) + .with_csv_options(options) .with_comment(Some(b'#')) - .with_schema(schema) .with_batch_size(8192) .with_projection(&scan_config); @@ -123,10 +129,9 @@ async fn json_opener() -> Result<()> { let scan_config = FileScanConfigBuilder::new( ObjectStoreUrl::local_filesystem(), - schema, - Arc::new(JsonSource::default()), + Arc::new(JsonSource::new(schema)), ) - .with_projection(Some(vec![1, 0])) + .with_projection_indices(Some(vec![1, 0])) .with_limit(Some(5)) .with_file(PartitionedFile::new(path.to_string(), 10)) .build(); diff --git a/datafusion-examples/examples/csv_sql_streaming.rs b/datafusion-examples/examples/custom_data_source/csv_sql_streaming.rs similarity index 98% rename from datafusion-examples/examples/csv_sql_streaming.rs rename to datafusion-examples/examples/custom_data_source/csv_sql_streaming.rs index 99264bbcb486..aca63c4f35c2 100644 --- a/datafusion-examples/examples/csv_sql_streaming.rs +++ b/datafusion-examples/examples/custom_data_source/csv_sql_streaming.rs @@ -21,8 +21,7 @@ use datafusion::prelude::*; /// This example demonstrates executing a simple query against an Arrow data source (CSV) and /// fetching results with streaming aggregation and streaming window -#[tokio::main] -async fn main() -> Result<()> { +pub async fn csv_sql_streaming() -> Result<()> { // create local execution context let ctx = SessionContext::new(); diff --git a/datafusion-examples/examples/custom_datasource.rs b/datafusion-examples/examples/custom_data_source/custom_datasource.rs similarity index 99% rename from datafusion-examples/examples/custom_datasource.rs rename to datafusion-examples/examples/custom_data_source/custom_datasource.rs index bc865fac5a33..2213d50fccda 100644 --- a/datafusion-examples/examples/custom_datasource.rs +++ b/datafusion-examples/examples/custom_data_source/custom_datasource.rs @@ -42,8 +42,7 @@ use datafusion::catalog::Session; use tokio::time::timeout; /// This example demonstrates executing a simple query against a custom datasource -#[tokio::main] -async fn main() -> Result<()> { +pub async fn custom_datasource() -> Result<()> { // create our custom datasource and adding some users let db = CustomDataSource::default(); db.populate_users(); diff --git a/datafusion-examples/examples/custom_file_casts.rs b/datafusion-examples/examples/custom_data_source/custom_file_casts.rs similarity index 99% rename from datafusion-examples/examples/custom_file_casts.rs rename to datafusion-examples/examples/custom_data_source/custom_file_casts.rs index 4d97ecd91dc6..31ec2845c611 100644 --- 
a/datafusion-examples/examples/custom_file_casts.rs +++ b/datafusion-examples/examples/custom_data_source/custom_file_casts.rs @@ -44,9 +44,7 @@ use object_store::{ObjectStore, PutPayload}; // This example enforces that casts must be strictly widening: if the file type is Int64 and the table type is Int32, it will error // before even reading the data. // Without this custom cast rule DataFusion would happily do the narrowing cast, potentially erroring only if it found a row with data it could not cast. - -#[tokio::main] -async fn main() -> Result<()> { +pub async fn custom_file_casts() -> Result<()> { println!("=== Creating example data ==="); // Create a logical / table schema with an Int32 column diff --git a/datafusion-examples/examples/custom_file_format.rs b/datafusion-examples/examples/custom_data_source/custom_file_format.rs similarity index 95% rename from datafusion-examples/examples/custom_file_format.rs rename to datafusion-examples/examples/custom_data_source/custom_file_format.rs index 67fe642fd46e..510fa53c593f 100644 --- a/datafusion-examples/examples/custom_file_format.rs +++ b/datafusion-examples/examples/custom_data_source/custom_file_format.rs @@ -30,6 +30,7 @@ use datafusion::{ FileFormat, FileFormatFactory, }, physical_plan::{FileScanConfig, FileSinkConfig, FileSource}, + table_schema::TableSchema, MemTable, }, error::Result, @@ -47,6 +48,42 @@ use tempfile::tempdir; /// TSVFileFormatFactory is responsible for creating instances of TSVFileFormat. /// The former, once registered with the SessionState, will then be used /// to facilitate SQL operations on TSV files, such as `COPY TO` shown here. +pub async fn custom_file_format() -> Result<()> { + // Create a new context with the default configuration + let mut state = SessionStateBuilder::new().with_default_features().build(); + + // Register the custom file format + let file_format = Arc::new(TSVFileFactory::new()); + state.register_file_format(file_format, true)?; + + // Create a new context with the custom file format + let ctx = SessionContext::new_with_state(state); + + let mem_table = create_mem_table(); + ctx.register_table("mem_table", mem_table)?; + + let temp_dir = tempdir().unwrap(); + let table_save_path = temp_dir.path().join("mem_table.tsv"); + + let d = ctx + .sql(&format!( + "COPY mem_table TO '{}' STORED AS TSV;", + table_save_path.display(), + )) + .await?; + + let results = d.collect().await?; + println!( + "Number of inserted rows: {:?}", + (results[0] + .column_by_name("count") + .unwrap() + .as_primitive::() + .value(0)) + ); + + Ok(()) +} #[derive(Debug)] /// Custom file format that reads and writes TSV files @@ -128,8 +165,8 @@ impl FileFormat for TSVFileFormat { .await } - fn file_source(&self) -> Arc { - self.csv_file_format.file_source() + fn file_source(&self, table_schema: TableSchema) -> Arc { + self.csv_file_format.file_source(table_schema) } } @@ -180,44 +217,6 @@ impl GetExt for TSVFileFactory { } } -#[tokio::main] -async fn main() -> Result<()> { - // Create a new context with the default configuration - let mut state = SessionStateBuilder::new().with_default_features().build(); - - // Register the custom file format - let file_format = Arc::new(TSVFileFactory::new()); - state.register_file_format(file_format, true).unwrap(); - - // Create a new context with the custom file format - let ctx = SessionContext::new_with_state(state); - - let mem_table = create_mem_table(); - ctx.register_table("mem_table", mem_table).unwrap(); - - let temp_dir = tempdir().unwrap(); - let table_save_path = 
temp_dir.path().join("mem_table.tsv"); - - let d = ctx - .sql(&format!( - "COPY mem_table TO '{}' STORED AS TSV;", - table_save_path.display(), - )) - .await?; - - let results = d.collect().await?; - println!( - "Number of inserted rows: {:?}", - (results[0] - .column_by_name("count") - .unwrap() - .as_primitive::() - .value(0)) - ); - - Ok(()) -} - // create a simple mem table fn create_mem_table() -> Arc { let fields = vec![ diff --git a/datafusion-examples/examples/file_stream_provider.rs b/datafusion-examples/examples/custom_data_source/file_stream_provider.rs similarity index 91% rename from datafusion-examples/examples/file_stream_provider.rs rename to datafusion-examples/examples/custom_data_source/file_stream_provider.rs index e6c59d57e98d..55d2cc8cc0af 100644 --- a/datafusion-examples/examples/file_stream_provider.rs +++ b/datafusion-examples/examples/custom_data_source/file_stream_provider.rs @@ -15,6 +15,29 @@ // specific language governing permissions and limitations // under the License. +/// Demonstrates how to use [`FileStreamProvider`] and [`StreamTable`] to stream data +/// from a file-like source (FIFO) into DataFusion for continuous querying. +/// +/// On non-Windows systems, this example creates a named pipe (FIFO) and +/// writes rows into it asynchronously while DataFusion reads the data +/// through a `FileStreamProvider`. +/// +/// This illustrates how to integrate dynamically updated data sources +/// with DataFusion without needing to reload the entire dataset each time. +/// +/// This example does not work on Windows. +pub async fn file_stream_provider() -> datafusion::error::Result<()> { + #[cfg(target_os = "windows")] + { + println!("file_stream_provider example does not work on windows"); + Ok(()) + } + #[cfg(not(target_os = "windows"))] + { + non_windows::main().await + } +} + #[cfg(not(target_os = "windows"))] mod non_windows { use datafusion::assert_batches_eq; @@ -186,16 +209,3 @@ mod non_windows { Ok(()) } } - -#[tokio::main] -async fn main() -> datafusion::error::Result<()> { - #[cfg(target_os = "windows")] - { - println!("file_stream_provider example does not work on windows"); - Ok(()) - } - #[cfg(not(target_os = "windows"))] - { - non_windows::main().await - } -} diff --git a/datafusion-examples/examples/custom_data_source/main.rs b/datafusion-examples/examples/custom_data_source/main.rs new file mode 100644 index 000000000000..ce0585f8c3f7 --- /dev/null +++ b/datafusion-examples/examples/custom_data_source/main.rs @@ -0,0 +1,126 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! # These examples are all related to extending or defining how DataFusion reads data +//! +//! These examples demonstrate how DataFusion reads data. +//! +//! ## Usage +//! ```bash +//! 
cargo run --example custom_data_source -- [csv_json_opener|csv_sql_streaming|custom_datasource|custom_file_casts|custom_file_format|file_stream_provider] +//! ``` +//! +//! Each subcommand runs a corresponding example: +//! - `csv_json_opener` — use low level FileOpener APIs to read CSV/JSON into Arrow RecordBatches +//! - `csv_sql_streaming` — build and run a streaming query plan from a SQL statement against a local CSV file +//! - `custom_datasource` — run queries against a custom datasource (TableProvider) +//! - `custom_file_casts` — implement custom casting rules to adapt file schemas +//! - `custom_file_format` — write data to a custom file format +//! - `file_stream_provider` — run a query on FileStreamProvider which implements StreamProvider for reading and writing to arbitrary stream sources/sinks + +mod csv_json_opener; +mod csv_sql_streaming; +mod custom_datasource; +mod custom_file_casts; +mod custom_file_format; +mod file_stream_provider; + +use std::str::FromStr; + +use datafusion::error::{DataFusionError, Result}; + +enum ExampleKind { + CsvJsonOpener, + CsvSqlStreaming, + CustomDatasource, + CustomFileCasts, + CustomFileFormat, + FileFtreamProvider, +} + +impl AsRef for ExampleKind { + fn as_ref(&self) -> &str { + match self { + Self::CsvJsonOpener => "csv_json_opener", + Self::CsvSqlStreaming => "csv_sql_streaming", + Self::CustomDatasource => "custom_datasource", + Self::CustomFileCasts => "custom_file_casts", + Self::CustomFileFormat => "custom_file_format", + Self::FileFtreamProvider => "file_stream_provider", + } + } +} + +impl FromStr for ExampleKind { + type Err = DataFusionError; + + fn from_str(s: &str) -> Result { + match s { + "csv_json_opener" => Ok(Self::CsvJsonOpener), + "csv_sql_streaming" => Ok(Self::CsvSqlStreaming), + "custom_datasource" => Ok(Self::CustomDatasource), + "custom_file_casts" => Ok(Self::CustomFileCasts), + "custom_file_format" => Ok(Self::CustomFileFormat), + "file_stream_provider" => Ok(Self::FileFtreamProvider), + _ => Err(DataFusionError::Execution(format!("Unknown example: {s}"))), + } + } +} + +impl ExampleKind { + const ALL: [Self; 6] = [ + Self::CsvJsonOpener, + Self::CsvSqlStreaming, + Self::CustomDatasource, + Self::CustomFileCasts, + Self::CustomFileFormat, + Self::FileFtreamProvider, + ]; + + const EXAMPLE_NAME: &str = "custom_data_source"; + + fn variants() -> Vec<&'static str> { + Self::ALL.iter().map(|x| x.as_ref()).collect() + } +} + +#[tokio::main] +async fn main() -> Result<()> { + let usage = format!( + "Usage: cargo run --example {} -- [{}]", + ExampleKind::EXAMPLE_NAME, + ExampleKind::variants().join("|") + ); + + let arg = std::env::args().nth(1).ok_or_else(|| { + eprintln!("{usage}"); + DataFusionError::Execution("Missing argument".to_string()) + })?; + + match arg.parse::()? { + ExampleKind::CsvJsonOpener => csv_json_opener::csv_json_opener().await?, + ExampleKind::CsvSqlStreaming => csv_sql_streaming::csv_sql_streaming().await?, + ExampleKind::CustomDatasource => custom_datasource::custom_datasource().await?, + ExampleKind::CustomFileCasts => custom_file_casts::custom_file_casts().await?, + ExampleKind::CustomFileFormat => custom_file_format::custom_file_format().await?, + ExampleKind::FileFtreamProvider => { + file_stream_provider::file_stream_provider().await? 
+ } + } + + Ok(()) +} diff --git a/datafusion-examples/examples/default_column_values.rs b/datafusion-examples/examples/default_column_values.rs index 43e2d4ca0988..bfc60519f26e 100644 --- a/datafusion-examples/examples/default_column_values.rs +++ b/datafusion-examples/examples/default_column_values.rs @@ -235,7 +235,7 @@ impl TableProvider for DefaultValueTableProvider { &df_schema, )?; - let parquet_source = ParquetSource::default() + let parquet_source = ParquetSource::new(schema.clone()) .with_predicate(filter) .with_pushdown_filters(true); @@ -257,10 +257,9 @@ impl TableProvider for DefaultValueTableProvider { let file_scan_config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("memory://")?, - self.schema.clone(), Arc::new(parquet_source), ) - .with_projection(projection.cloned()) + .with_projection_indices(projection.cloned()) .with_limit(limit) .with_file_group(file_group) .with_expr_adapter(Some(Arc::new(DefaultValuePhysicalExprAdapterFactory) as _)); diff --git a/datafusion-examples/examples/external_dependency/query-aws-s3.rs b/datafusion-examples/examples/external_dependency/query-aws-s3.rs index da2d7e4879f9..cd0b4562d5f2 100644 --- a/datafusion-examples/examples/external_dependency/query-aws-s3.rs +++ b/datafusion-examples/examples/external_dependency/query-aws-s3.rs @@ -28,7 +28,6 @@ use url::Url; /// /// - AWS_ACCESS_KEY_ID /// - AWS_SECRET_ACCESS_KEY -/// #[tokio::main] async fn main() -> Result<()> { let ctx = SessionContext::new(); diff --git a/datafusion-examples/examples/flight/flight_client.rs b/datafusion-examples/examples/flight/client.rs similarity index 91% rename from datafusion-examples/examples/flight/flight_client.rs rename to datafusion-examples/examples/flight/client.rs index e3237284b430..031beea47d57 100644 --- a/datafusion-examples/examples/flight/flight_client.rs +++ b/datafusion-examples/examples/flight/client.rs @@ -17,6 +17,7 @@ use std::collections::HashMap; use std::sync::Arc; +use tonic::transport::Endpoint; use datafusion::arrow::datatypes::Schema; @@ -29,12 +30,13 @@ use datafusion::arrow::util::pretty; /// This example shows how to wrap DataFusion with `FlightService` to support looking up schema information for /// Parquet files and executing SQL queries against them on a remote server. /// This example is run along-side the example `flight_server`. -#[tokio::main] -async fn main() -> Result<(), Box> { +pub async fn client() -> Result<(), Box> { let testdata = datafusion::test_util::parquet_test_data(); // Create Flight client - let mut client = FlightServiceClient::connect("http://localhost:50051").await?; + let endpoint = Endpoint::new("http://localhost:50051")?; + let channel = endpoint.connect().await?; + let mut client = FlightServiceClient::new(channel); // Call get_schema to get the schema of a Parquet file let request = tonic::Request::new(FlightDescriptor { diff --git a/datafusion-examples/examples/flight/main.rs b/datafusion-examples/examples/flight/main.rs new file mode 100644 index 000000000000..a83b19bac42e --- /dev/null +++ b/datafusion-examples/examples/flight/main.rs @@ -0,0 +1,99 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! # Arrow Flight Examples +//! +//! These examples demonstrate Arrow Flight usage. +//! +//! ## Usage +//! ```bash +//! cargo run --example flight -- [client|server|sql_server] +//! ``` +//! +//! Each subcommand runs a corresponding example: +//! - `client` — run DataFusion as a standalone process and execute SQL queries from a client using the Flight protocol +//! - `server` — run DataFusion as a standalone process and execute SQL queries from a client using the Flight protocol +//! - `sql_server` — run DataFusion as a standalone process and execute SQL queries from JDBC clients + +mod client; +mod server; +mod sql_server; + +use std::str::FromStr; + +use datafusion::error::{DataFusionError, Result}; + +enum ExampleKind { + Client, + Server, + SqlServer, +} + +impl AsRef for ExampleKind { + fn as_ref(&self) -> &str { + match self { + Self::Client => "client", + Self::Server => "server", + Self::SqlServer => "sql_server", + } + } +} + +impl FromStr for ExampleKind { + type Err = DataFusionError; + + fn from_str(s: &str) -> Result { + match s { + "client" => Ok(Self::Client), + "server" => Ok(Self::Server), + "sql_server" => Ok(Self::SqlServer), + _ => Err(DataFusionError::Execution(format!("Unknown example: {s}"))), + } + } +} + +impl ExampleKind { + const ALL: [Self; 3] = [Self::Client, Self::Server, Self::SqlServer]; + + const EXAMPLE_NAME: &str = "flight"; + + fn variants() -> Vec<&'static str> { + Self::ALL.iter().map(|x| x.as_ref()).collect() + } +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let usage = format!( + "Usage: cargo run --example {} -- [{}]", + ExampleKind::EXAMPLE_NAME, + ExampleKind::variants().join("|") + ); + + let arg = std::env::args().nth(1).ok_or_else(|| { + eprintln!("{usage}"); + DataFusionError::Execution("Missing argument".to_string()) + })?; + + match arg.parse::()? { + ExampleKind::Client => client::client().await?, + ExampleKind::Server => server::server().await?, + ExampleKind::SqlServer => sql_server::sql_server().await?, + } + + Ok(()) +} diff --git a/datafusion-examples/examples/flight/flight_server.rs b/datafusion-examples/examples/flight/server.rs similarity index 95% rename from datafusion-examples/examples/flight/flight_server.rs rename to datafusion-examples/examples/flight/server.rs index 58bfb7a341c1..dc75287cf2e2 100644 --- a/datafusion-examples/examples/flight/flight_server.rs +++ b/datafusion-examples/examples/flight/server.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
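The flight client change above (tonic 0.14) no longer calls `FlightServiceClient::connect` directly; it builds an explicit `Endpoint`, connects it to obtain a `Channel`, and hands that to the generated client. A minimal sketch of that connection step, with import paths assumed rather than taken from the patch:

```rust
// Sketch of the tonic 0.14 connection pattern used by the updated flight client.
use arrow_flight::flight_service_client::FlightServiceClient;
use tonic::transport::{Channel, Endpoint};

async fn connect_flight_client(
    url: &'static str,
) -> Result<FlightServiceClient<Channel>, Box<dyn std::error::Error>> {
    // Build the endpoint explicitly, connect to get a Channel,
    // then construct the generated client from that Channel.
    let endpoint = Endpoint::new(url)?;
    let channel = endpoint.connect().await?;
    Ok(FlightServiceClient::new(channel))
}
```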
-use arrow::ipc::writer::{DictionaryTracker, IpcDataGenerator}; +use arrow::ipc::writer::{CompressionContext, DictionaryTracker, IpcDataGenerator}; use std::sync::Arc; use arrow_flight::{PollInfo, SchemaAsIpc}; @@ -106,6 +106,7 @@ impl FlightService for FlightServiceImpl { // add an initial FlightData message that sends schema let options = arrow::ipc::writer::IpcWriteOptions::default(); + let mut compression_context = CompressionContext::default(); let schema_flight_data = SchemaAsIpc::new(&schema, &options); let mut flights = vec![FlightData::from(schema_flight_data)]; @@ -115,7 +116,7 @@ impl FlightService for FlightServiceImpl { for batch in &results { let (flight_dictionaries, flight_batch) = encoder - .encoded_batch(batch, &mut tracker, &options) + .encode(batch, &mut tracker, &options, &mut compression_context) .map_err(|e: ArrowError| Status::internal(e.to_string()))?; flights.extend(flight_dictionaries.into_iter().map(Into::into)); @@ -193,8 +194,7 @@ fn to_tonic_err(e: datafusion::error::DataFusionError) -> Status { /// This example shows how to wrap DataFusion with `FlightService` to support looking up schema information for /// Parquet files and executing SQL queries against them on a remote server. /// This example is run along-side the example `flight_client`. -#[tokio::main] -async fn main() -> Result<(), Box> { +pub async fn server() -> Result<(), Box> { let addr = "0.0.0.0:50051".parse()?; let service = FlightServiceImpl {}; diff --git a/datafusion-examples/examples/flight/flight_sql_server.rs b/datafusion-examples/examples/flight/sql_server.rs similarity index 99% rename from datafusion-examples/examples/flight/flight_sql_server.rs rename to datafusion-examples/examples/flight/sql_server.rs index c35debec7d71..d86860f9d436 100644 --- a/datafusion-examples/examples/flight/flight_sql_server.rs +++ b/datafusion-examples/examples/flight/sql_server.rs @@ -68,9 +68,7 @@ macro_rules! 
status { /// /// Based heavily on Ballista's implementation: https://github.com/apache/datafusion-ballista/blob/main/ballista/scheduler/src/flight_sql.rs /// and the example in arrow-rs: https://github.com/apache/arrow-rs/blob/master/arrow-flight/examples/flight_sql_server.rs -/// -#[tokio::main] -async fn main() -> Result<(), Box> { +pub async fn sql_server() -> Result<(), Box> { env_logger::init(); let addr = "0.0.0.0:50051".parse()?; let service = FlightSqlServiceImpl { diff --git a/datafusion-examples/examples/json_shredding.rs b/datafusion-examples/examples/json_shredding.rs index a2e83bc9510a..5ef8b59b6420 100644 --- a/datafusion-examples/examples/json_shredding.rs +++ b/datafusion-examples/examples/json_shredding.rs @@ -142,7 +142,7 @@ async fn main() -> Result<()> { .await?; let plan = format!("{}", arrow::util::pretty::pretty_format_batches(&batches)?); println!("{plan}"); - assert_contains!(&plan, "row_groups_pruned_statistics=1"); + assert_contains!(&plan, "row_groups_pruned_statistics=2 total → 1 matched"); assert_contains!(&plan, "pushdown_rows_pruned=1"); Ok(()) diff --git a/datafusion-examples/examples/parquet_embedded_index.rs b/datafusion-examples/examples/parquet_embedded_index.rs index 3cbe18914775..bc0e5a072caa 100644 --- a/datafusion-examples/examples/parquet_embedded_index.rs +++ b/datafusion-examples/examples/parquet_embedded_index.rs @@ -426,8 +426,10 @@ impl TableProvider for DistinctIndexTable { // Build ParquetSource to actually read the files let url = ObjectStoreUrl::parse("file://")?; - let source = Arc::new(ParquetSource::default().with_enable_page_index(true)); - let mut builder = FileScanConfigBuilder::new(url, self.schema.clone(), source); + let source = Arc::new( + ParquetSource::new(self.schema.clone()).with_enable_page_index(true), + ); + let mut builder = FileScanConfigBuilder::new(url, source); for file in files_to_scan { let path = self.dir.join(file); let len = std::fs::metadata(&path)?.len(); diff --git a/datafusion-examples/examples/parquet_encrypted.rs b/datafusion-examples/examples/parquet_encrypted.rs index e9e239b7a1c3..690d9f2a5f14 100644 --- a/datafusion-examples/examples/parquet_encrypted.rs +++ b/datafusion-examples/examples/parquet_encrypted.rs @@ -16,12 +16,13 @@ // under the License. 
use datafusion::common::DataFusionError; -use datafusion::config::TableParquetOptions; +use datafusion::config::{ConfigFileEncryptionProperties, TableParquetOptions}; use datafusion::dataframe::{DataFrame, DataFrameWriteOptions}; use datafusion::logical_expr::{col, lit}; use datafusion::parquet::encryption::decrypt::FileDecryptionProperties; use datafusion::parquet::encryption::encrypt::FileEncryptionProperties; use datafusion::prelude::{ParquetReadOptions, SessionContext}; +use std::sync::Arc; use tempfile::TempDir; #[tokio::main] @@ -55,7 +56,7 @@ async fn main() -> datafusion::common::Result<()> { // Write encrypted parquet let mut options = TableParquetOptions::default(); - options.crypto.file_encryption = Some((&encrypt).into()); + options.crypto.file_encryption = Some(ConfigFileEncryptionProperties::from(&encrypt)); parquet_df .write_parquet( tempfile_str.as_str(), @@ -100,7 +101,8 @@ async fn query_dataframe(df: &DataFrame) -> Result<(), DataFusionError> { // Setup encryption and decryption properties fn setup_encryption( parquet_df: &DataFrame, -) -> Result<(FileEncryptionProperties, FileDecryptionProperties), DataFusionError> { +) -> Result<(Arc, Arc), DataFusionError> +{ let schema = parquet_df.schema(); let footer_key = b"0123456789012345".to_vec(); // 128bit/16 let column_key = b"1234567890123450".to_vec(); // 128bit/16 diff --git a/datafusion-examples/examples/parquet_encrypted_with_kms.rs b/datafusion-examples/examples/parquet_encrypted_with_kms.rs index 19b0e8d0b199..45bfd183773a 100644 --- a/datafusion-examples/examples/parquet_encrypted_with_kms.rs +++ b/datafusion-examples/examples/parquet_encrypted_with_kms.rs @@ -226,7 +226,7 @@ impl EncryptionFactory for TestEncryptionFactory { options: &EncryptionFactoryOptions, schema: &SchemaRef, _file_path: &Path, - ) -> Result> { + ) -> Result>> { let config: EncryptionConfig = options.to_extension_options()?; // Generate a random encryption key for this file. 
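The `parquet_encrypted.rs` hunk above makes two related changes: the encryption and decryption properties are now `Arc`-wrapped, and the write path converts them explicitly through `ConfigFileEncryptionProperties` instead of relying on `.into()`. A minimal sketch of the updated write path, assuming an already-built `Arc<FileEncryptionProperties>` and the same imports that example uses:

```rust
// Sketch only: mirrors the calls shown in the parquet_encrypted.rs hunk above.
use std::sync::Arc;

use datafusion::config::{ConfigFileEncryptionProperties, TableParquetOptions};
use datafusion::dataframe::{DataFrame, DataFrameWriteOptions};
use datafusion::parquet::encryption::encrypt::FileEncryptionProperties;

async fn write_encrypted(
    df: DataFrame,
    path: &str,
    encrypt: Arc<FileEncryptionProperties>,
) -> datafusion::common::Result<()> {
    let mut options = TableParquetOptions::default();
    // Explicit conversion replaces the previous `(&encrypt).into()`.
    options.crypto.file_encryption =
        Some(ConfigFileEncryptionProperties::from(&encrypt));
    // Write the DataFrame as encrypted Parquet using these writer options.
    df.write_parquet(path, DataFrameWriteOptions::new(), Some(options))
        .await?;
    Ok(())
}
```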
@@ -268,7 +268,7 @@ impl EncryptionFactory for TestEncryptionFactory { &self, _options: &EncryptionFactoryOptions, _file_path: &Path, - ) -> Result> { + ) -> Result>> { let decryption_properties = FileDecryptionProperties::with_key_retriever(Arc::new(TestKeyRetriever {})) .build()?; diff --git a/datafusion-examples/examples/parquet_index.rs b/datafusion-examples/examples/parquet_index.rs index afc3b279f4a9..bc9e2a9226d0 100644 --- a/datafusion-examples/examples/parquet_index.rs +++ b/datafusion-examples/examples/parquet_index.rs @@ -99,7 +99,6 @@ use url::Url; /// Thus some parquet files are │ │ /// "pruned" and thus are not └─────────────┘ /// scanned at all Parquet Files -/// /// ``` /// /// [`ListingTable`]: datafusion::datasource::listing::ListingTable @@ -243,10 +242,11 @@ impl TableProvider for IndexTableProvider { let files = self.index.get_files(predicate.clone())?; let object_store_url = ObjectStoreUrl::parse("file://")?; - let source = Arc::new(ParquetSource::default().with_predicate(predicate)); + let source = + Arc::new(ParquetSource::new(self.schema()).with_predicate(predicate)); let mut file_scan_config_builder = - FileScanConfigBuilder::new(object_store_url, self.schema(), source) - .with_projection(projection.cloned()) + FileScanConfigBuilder::new(object_store_url, source) + .with_projection_indices(projection.cloned()) .with_limit(limit); // Transform to the format needed to pass to DataSourceExec diff --git a/datafusion-examples/examples/sql_query.rs b/datafusion-examples/examples/sql_query.rs index 0ac203cfb7e7..4da07d33d03d 100644 --- a/datafusion-examples/examples/sql_query.rs +++ b/datafusion-examples/examples/sql_query.rs @@ -32,7 +32,6 @@ use std::sync::Arc; /// /// [`query_memtable`]: a simple query against a [`MemTable`] /// [`query_parquet`]: a simple query against a directory with multiple Parquet files -/// #[tokio::main] async fn main() -> Result<()> { query_memtable().await?; diff --git a/datafusion-examples/examples/thread_pools.rs b/datafusion-examples/examples/thread_pools.rs index bba56b2932ab..9842cccfbfe8 100644 --- a/datafusion-examples/examples/thread_pools.rs +++ b/datafusion-examples/examples/thread_pools.rs @@ -342,7 +342,7 @@ impl CpuRuntime { /// message such as: /// /// ```text - ///A Tokio 1.x context was found, but IO is disabled. + /// A Tokio 1.x context was found, but IO is disabled. 
/// ``` pub fn handle(&self) -> &Handle { &self.handle diff --git a/datafusion-examples/examples/advanced_udaf.rs b/datafusion-examples/examples/udf/advanced_udaf.rs similarity index 98% rename from datafusion-examples/examples/advanced_udaf.rs rename to datafusion-examples/examples/udf/advanced_udaf.rs index 89f0a470e32e..81e227bfacee 100644 --- a/datafusion-examples/examples/advanced_udaf.rs +++ b/datafusion-examples/examples/udf/advanced_udaf.rs @@ -469,8 +469,9 @@ fn create_context() -> Result { Ok(ctx) } -#[tokio::main] -async fn main() -> Result<()> { +/// In this example we register `GeoMeanUdaf` and `SimplifiedGeoMeanUdaf` +/// as user defined aggregate functions and invoke them via the DataFrame API and SQL +pub async fn advanced_udaf() -> Result<()> { let ctx = create_context()?; let geo_mean_udf = AggregateUDF::from(GeoMeanUdaf::new()); diff --git a/datafusion-examples/examples/advanced_udf.rs b/datafusion-examples/examples/udf/advanced_udf.rs similarity index 99% rename from datafusion-examples/examples/advanced_udf.rs rename to datafusion-examples/examples/udf/advanced_udf.rs index 56ae599efa11..bb5a68e90cbb 100644 --- a/datafusion-examples/examples/advanced_udf.rs +++ b/datafusion-examples/examples/udf/advanced_udf.rs @@ -245,10 +245,35 @@ fn maybe_pow_in_place(base: f64, exp_array: ArrayRef) -> Result { } } +/// create local execution context with an in-memory table: +/// +/// ```text +/// +-----+-----+ +/// | a | b | +/// +-----+-----+ +/// | 2.1 | 1.0 | +/// | 3.1 | 2.0 | +/// | 4.1 | 3.0 | +/// | 5.1 | 4.0 | +/// +-----+-----+ +/// ``` +fn create_context() -> Result { + // define data. + let a: ArrayRef = Arc::new(Float32Array::from(vec![2.1, 3.1, 4.1, 5.1])); + let b: ArrayRef = Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])); + let batch = RecordBatch::try_from_iter(vec![("a", a), ("b", b)])?; + + // declare a new context. In Spark API, this corresponds to a new SparkSession + let ctx = SessionContext::new(); + + // declare a table in memory. In Spark API, this corresponds to createDataFrame(...). + ctx.register_batch("t", batch)?; + Ok(ctx) +} + /// In this example we register `PowUdf` as a user defined function /// and invoke it via the DataFrame API and SQL -#[tokio::main] -async fn main() -> Result<()> { +pub async fn advanced_udf() -> Result<()> { let ctx = create_context()?; // create the UDF @@ -295,29 +320,3 @@ async fn main() -> Result<()> { Ok(()) } - -/// create local execution context with an in-memory table: -/// -/// ```text -/// +-----+-----+ -/// | a | b | -/// +-----+-----+ -/// | 2.1 | 1.0 | -/// | 3.1 | 2.0 | -/// | 4.1 | 3.0 | -/// | 5.1 | 4.0 | -/// +-----+-----+ -/// ``` -fn create_context() -> Result { - // define data. - let a: ArrayRef = Arc::new(Float32Array::from(vec![2.1, 3.1, 4.1, 5.1])); - let b: ArrayRef = Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])); - let batch = RecordBatch::try_from_iter(vec![("a", a), ("b", b)])?; - - // declare a new context. In Spark API, this corresponds to a new SparkSession - let ctx = SessionContext::new(); - - // declare a table in memory. In Spark API, this corresponds to createDataFrame(...). 
- ctx.register_batch("t", batch)?; - Ok(ctx) -} diff --git a/datafusion-examples/examples/advanced_udwf.rs b/datafusion-examples/examples/udf/advanced_udwf.rs similarity index 98% rename from datafusion-examples/examples/advanced_udwf.rs rename to datafusion-examples/examples/udf/advanced_udwf.rs index ba4c377fd676..86f215e019c7 100644 --- a/datafusion-examples/examples/advanced_udwf.rs +++ b/datafusion-examples/examples/udf/advanced_udwf.rs @@ -236,8 +236,9 @@ async fn create_context() -> Result { Ok(ctx) } -#[tokio::main] -async fn main() -> Result<()> { +/// In this example we register `SmoothItUdf` as user defined window function +/// and invoke it via the DataFrame API and SQL +pub async fn advanced_udwf() -> Result<()> { let ctx = create_context().await?; let smooth_it = WindowUDF::from(SmoothItUdf::new()); ctx.register_udwf(smooth_it.clone()); diff --git a/datafusion-examples/examples/async_udf.rs b/datafusion-examples/examples/udf/async_udf.rs similarity index 98% rename from datafusion-examples/examples/async_udf.rs rename to datafusion-examples/examples/udf/async_udf.rs index b52ec68ea442..475775a599f6 100644 --- a/datafusion-examples/examples/async_udf.rs +++ b/datafusion-examples/examples/udf/async_udf.rs @@ -38,8 +38,9 @@ use datafusion::prelude::{SessionConfig, SessionContext}; use std::any::Any; use std::sync::Arc; -#[tokio::main] -async fn main() -> Result<()> { +/// In this example we register `AskLLM` as an asynchronous user defined function +/// and invoke it via the DataFrame API and SQL +pub async fn async_udf() -> Result<()> { // Use a hard coded parallelism level of 4 so the explain plan // is consistent across machines. let config = SessionConfig::new().with_target_partitions(4); diff --git a/datafusion-examples/examples/udf/main.rs b/datafusion-examples/examples/udf/main.rs new file mode 100644 index 000000000000..104d37393780 --- /dev/null +++ b/datafusion-examples/examples/udf/main.rs @@ -0,0 +1,138 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! # User-Defined Functions Examples +//! +//! These examples demonstrate user-defined functions in DataFusion. +//! +//! ## Usage +//! ```bash +//! cargo run --example udf -- [adv_udaf|adv_udf|adv_udwf|async_udf|udaf|udf|udtf|udwf] +//! ``` +//! +//! Each subcommand runs a corresponding example: +//! - `adv_udaf` — user defined aggregate function example +//! - `adv_udf` — user defined scalar function example +//! - `adv_udwf` — user defined window function example +//! - `async_udf` — asynchronous user defined function example +//! - `udaf` — simple user defined aggregate function example +//! - `udf` — simple user defined scalar function example +//! - `udtf` — simple user defined table function example +//! 
- `udwf` — simple user defined window function example + +mod advanced_udaf; +mod advanced_udf; +mod advanced_udwf; +mod async_udf; +mod simple_udaf; +mod simple_udf; +mod simple_udtf; +mod simple_udwf; + +use std::str::FromStr; + +use datafusion::error::{DataFusionError, Result}; + +enum ExampleKind { + AdvUdaf, + AdvUdf, + AdvUdwf, + AsyncUdf, + Udf, + Udaf, + Udwf, + Udtf, +} + +impl AsRef for ExampleKind { + fn as_ref(&self) -> &str { + match self { + Self::AdvUdaf => "adv_udaf", + Self::AdvUdf => "adv_udf", + Self::AdvUdwf => "adv_udwf", + Self::AsyncUdf => "async_udf", + Self::Udf => "udf", + Self::Udaf => "udaf", + Self::Udwf => "udwt", + Self::Udtf => "udtf", + } + } +} + +impl FromStr for ExampleKind { + type Err = DataFusionError; + + fn from_str(s: &str) -> Result { + match s { + "adv_udaf" => Ok(Self::AdvUdaf), + "adv_udf" => Ok(Self::AdvUdf), + "adv_udwf" => Ok(Self::AdvUdwf), + "async_udf" => Ok(Self::AsyncUdf), + "udaf" => Ok(Self::Udaf), + "udf" => Ok(Self::Udf), + "udtf" => Ok(Self::Udtf), + "udwf" => Ok(Self::Udwf), + _ => Err(DataFusionError::Execution(format!("Unknown example: {s}"))), + } + } +} + +impl ExampleKind { + const ALL: [Self; 8] = [ + Self::AdvUdaf, + Self::AdvUdf, + Self::AdvUdwf, + Self::AsyncUdf, + Self::Udaf, + Self::Udf, + Self::Udtf, + Self::Udwf, + ]; + + const EXAMPLE_NAME: &str = "udf"; + + fn variants() -> Vec<&'static str> { + Self::ALL.iter().map(|x| x.as_ref()).collect() + } +} + +#[tokio::main] +async fn main() -> Result<()> { + let usage = format!( + "Usage: cargo run --example {} -- [{}]", + ExampleKind::EXAMPLE_NAME, + ExampleKind::variants().join("|") + ); + + let arg = std::env::args().nth(1).ok_or_else(|| { + eprintln!("{usage}"); + DataFusionError::Execution("Missing argument".to_string()) + })?; + + match arg.parse::()? { + ExampleKind::AdvUdaf => advanced_udaf::advanced_udaf().await?, + ExampleKind::AdvUdf => advanced_udf::advanced_udf().await?, + ExampleKind::AdvUdwf => advanced_udwf::advanced_udwf().await?, + ExampleKind::AsyncUdf => async_udf::async_udf().await?, + ExampleKind::Udaf => simple_udaf::simple_udaf().await?, + ExampleKind::Udf => simple_udf::simple_udf().await?, + ExampleKind::Udtf => simple_udtf::simple_udtf().await?, + ExampleKind::Udwf => simple_udwf::simple_udwf().await?, + } + + Ok(()) +} diff --git a/datafusion-examples/examples/simple_udaf.rs b/datafusion-examples/examples/udf/simple_udaf.rs similarity index 97% rename from datafusion-examples/examples/simple_udaf.rs rename to datafusion-examples/examples/udf/simple_udaf.rs index 82bde7c034a5..e9f905e72099 100644 --- a/datafusion-examples/examples/simple_udaf.rs +++ b/datafusion-examples/examples/udf/simple_udaf.rs @@ -135,8 +135,9 @@ impl Accumulator for GeometricMean { } } -#[tokio::main] -async fn main() -> Result<()> { +/// In this example we register `GeometricMean` +/// as user defined aggregate function and invoke it via the DataFrame API and SQL +pub async fn simple_udaf() -> Result<()> { let ctx = create_context()?; // here is where we define the UDAF. 
We also declare its signature: diff --git a/datafusion-examples/examples/simple_udf.rs b/datafusion-examples/examples/udf/simple_udf.rs similarity index 99% rename from datafusion-examples/examples/simple_udf.rs rename to datafusion-examples/examples/udf/simple_udf.rs index 5612e0939f70..7d4f3588e313 100644 --- a/datafusion-examples/examples/simple_udf.rs +++ b/datafusion-examples/examples/udf/simple_udf.rs @@ -57,8 +57,7 @@ fn create_context() -> Result { } /// In this example we will declare a single-type, single return type UDF that exponentiates f64, a^b -#[tokio::main] -async fn main() -> Result<()> { +pub async fn simple_udf() -> Result<()> { let ctx = create_context()?; // First, declare the actual implementation of the calculation diff --git a/datafusion-examples/examples/simple_udtf.rs b/datafusion-examples/examples/udf/simple_udtf.rs similarity index 99% rename from datafusion-examples/examples/simple_udtf.rs rename to datafusion-examples/examples/udf/simple_udtf.rs index b65ffb8d7174..a03b157134ae 100644 --- a/datafusion-examples/examples/simple_udtf.rs +++ b/datafusion-examples/examples/udf/simple_udtf.rs @@ -42,8 +42,7 @@ use std::sync::Arc; // 3. Register the function using [`SessionContext::register_udtf`] /// This example demonstrates how to register a TableFunction -#[tokio::main] -async fn main() -> Result<()> { +pub async fn simple_udtf() -> Result<()> { // create local execution context let ctx = SessionContext::new(); diff --git a/datafusion-examples/examples/simple_udwf.rs b/datafusion-examples/examples/udf/simple_udwf.rs similarity index 99% rename from datafusion-examples/examples/simple_udwf.rs rename to datafusion-examples/examples/udf/simple_udwf.rs index 1736ff00bd70..2cf1df8d8ed8 100644 --- a/datafusion-examples/examples/simple_udwf.rs +++ b/datafusion-examples/examples/udf/simple_udwf.rs @@ -42,8 +42,7 @@ async fn create_context() -> Result { } /// In this example we will declare a user defined window function that computes a moving average and then run it using SQL -#[tokio::main] -async fn main() -> Result<()> { +pub async fn simple_udwf() -> Result<()> { let ctx = create_context().await?; // here is where we define the UDWF. We also declare its signature: diff --git a/datafusion/catalog-listing/Cargo.toml b/datafusion/catalog-listing/Cargo.toml index 4eaeed675a20..be1374b37148 100644 --- a/datafusion/catalog-listing/Cargo.toml +++ b/datafusion/catalog-listing/Cargo.toml @@ -46,11 +46,13 @@ futures = { workspace = true } itertools = { workspace = true } log = { workspace = true } object_store = { workspace = true } -tokio = { workspace = true } [dev-dependencies] datafusion-datasource-parquet = { workspace = true } +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/catalog-listing/src/config.rs b/datafusion/catalog-listing/src/config.rs index 90f44de4fdbc..3370d2ea7553 100644 --- a/datafusion/catalog-listing/src/config.rs +++ b/datafusion/catalog-listing/src/config.rs @@ -53,7 +53,6 @@ pub enum SchemaSource { /// /// If not specified, a [`datafusion_datasource::schema_adapter::DefaultSchemaAdapterFactory`] /// will be used, which handles basic schema compatibility cases. -/// #[derive(Debug, Clone, Default)] pub struct ListingTableConfig { /// Paths on the `ObjectStore` for creating [`crate::ListingTable`]. 
@@ -160,8 +159,8 @@ impl ListingTableConfig { /// .with_file_extension(".parquet") /// .with_collect_stat(true); /// - /// let config = ListingTableConfig::new(table_paths) - /// .with_listing_options(options); // Configure file format and options + /// let config = ListingTableConfig::new(table_paths).with_listing_options(options); + /// // Configure file format and options /// ``` pub fn with_listing_options(self, listing_options: ListingOptions) -> Self { // Note: This method properly sets options, but be aware that downstream diff --git a/datafusion/catalog-listing/src/helpers.rs b/datafusion/catalog-listing/src/helpers.rs index 82cc36867939..089457648d21 100644 --- a/datafusion/catalog-listing/src/helpers.rs +++ b/datafusion/catalog-listing/src/helpers.rs @@ -25,12 +25,11 @@ use datafusion_common::internal_err; use datafusion_common::{HashMap, Result, ScalarValue}; use datafusion_datasource::ListingTableUrl; use datafusion_datasource::PartitionedFile; -use datafusion_expr::{BinaryExpr, Operator}; +use datafusion_expr::{lit, utils, BinaryExpr, Operator}; use arrow::{ - array::{Array, ArrayRef, AsArray, StringBuilder}, - compute::{and, cast, prep_null_mask_filter}, - datatypes::{DataType, Field, Fields, Schema}, + array::AsArray, + datatypes::{DataType, Field}, record_batch::RecordBatch, }; use datafusion_expr::execution_props::ExecutionProps; @@ -39,7 +38,7 @@ use futures::{stream::BoxStream, StreamExt, TryStreamExt}; use log::{debug, trace}; use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; -use datafusion_common::{Column, DFSchema, DataFusionError}; +use datafusion_common::{Column, DFSchema}; use datafusion_expr::{Expr, Volatility}; use datafusion_physical_expr::create_physical_expr; use object_store::path::Path; @@ -239,105 +238,6 @@ pub async fn list_partitions( Ok(out) } -async fn prune_partitions( - table_path: &ListingTableUrl, - partitions: Vec, - filters: &[Expr], - partition_cols: &[(String, DataType)], -) -> Result> { - if filters.is_empty() { - // prune partitions which don't contain the partition columns - return Ok(partitions - .into_iter() - .filter(|p| { - let cols = partition_cols.iter().map(|x| x.0.as_str()); - !parse_partitions_for_path(table_path, &p.path, cols) - .unwrap_or_default() - .is_empty() - }) - .collect()); - } - - let mut builders: Vec<_> = (0..partition_cols.len()) - .map(|_| StringBuilder::with_capacity(partitions.len(), partitions.len() * 10)) - .collect(); - - for partition in &partitions { - let cols = partition_cols.iter().map(|x| x.0.as_str()); - let parsed = parse_partitions_for_path(table_path, &partition.path, cols) - .unwrap_or_default(); - - let mut builders = builders.iter_mut(); - for (p, b) in parsed.iter().zip(&mut builders) { - b.append_value(p); - } - builders.for_each(|b| b.append_null()); - } - - let arrays = partition_cols - .iter() - .zip(builders) - .map(|((_, d), mut builder)| { - let array = builder.finish(); - cast(&array, d) - }) - .collect::>()?; - - let fields: Fields = partition_cols - .iter() - .map(|(n, d)| Field::new(n, d.clone(), true)) - .collect(); - let schema = Arc::new(Schema::new(fields)); - - let df_schema = DFSchema::from_unqualified_fields( - partition_cols - .iter() - .map(|(n, d)| Field::new(n, d.clone(), true)) - .collect(), - Default::default(), - )?; - - let batch = RecordBatch::try_new(schema, arrays)?; - - // TODO: Plumb this down - let props = ExecutionProps::new(); - - // Applies `filter` to `batch` returning `None` on error - let do_filter = |filter| -> Result { - let expr = 
create_physical_expr(filter, &df_schema, &props)?; - expr.evaluate(&batch)?.into_array(partitions.len()) - }; - - //.Compute the conjunction of the filters - let mask = filters - .iter() - .map(|f| do_filter(f).map(|a| a.as_boolean().clone())) - .reduce(|a, b| Ok(and(&a?, &b?)?)); - - let mask = match mask { - Some(Ok(mask)) => mask, - Some(Err(err)) => return Err(err), - None => return Ok(partitions), - }; - - // Don't retain partitions that evaluated to null - let prepared = match mask.null_count() { - 0 => mask, - _ => prep_null_mask_filter(&mask), - }; - - // Sanity check - assert_eq!(prepared.len(), partitions.len()); - - let filtered = partitions - .into_iter() - .zip(prepared.values()) - .filter_map(|(p, f)| f.then_some(p)) - .collect(); - - Ok(filtered) -} - #[derive(Debug)] enum PartitionValue { Single(String), @@ -412,6 +312,62 @@ pub fn evaluate_partition_prefix<'a>( } } +fn filter_partitions( + pf: PartitionedFile, + filters: &[Expr], + df_schema: &DFSchema, +) -> Result> { + if pf.partition_values.is_empty() && !filters.is_empty() { + return Ok(None); + } else if filters.is_empty() { + return Ok(Some(pf)); + } + + let arrays = pf + .partition_values + .iter() + .map(|v| v.to_array()) + .collect::>()?; + + let batch = RecordBatch::try_new(Arc::clone(df_schema.inner()), arrays)?; + + let filter = utils::conjunction(filters.iter().cloned()).unwrap_or_else(|| lit(true)); + let props = ExecutionProps::new(); + let expr = create_physical_expr(&filter, df_schema, &props)?; + + // Since we're only operating on a single file, our batch and resulting "array" holds only one + // value indicating if the input file matches the provided filters + let matches = expr.evaluate(&batch)?.into_array(1)?; + if matches.as_boolean().value(0) { + return Ok(Some(pf)); + } + + Ok(None) +} + +fn try_into_partitioned_file( + object_meta: ObjectMeta, + partition_cols: &[(String, DataType)], + table_path: &ListingTableUrl, +) -> Result { + let cols = partition_cols.iter().map(|(name, _)| name.as_str()); + let parsed = parse_partitions_for_path(table_path, &object_meta.location, cols); + + let partition_values = parsed + .into_iter() + .flatten() + .zip(partition_cols) + .map(|(parsed, (_, datatype))| { + ScalarValue::try_from_string(parsed.to_string(), datatype) + }) + .collect::>>()?; + + let mut pf: PartitionedFile = object_meta.into(); + pf.partition_values = partition_values; + + Ok(pf) +} + /// Discover the partitions on the given path and prune out files /// that belong to irrelevant partitions using `filters` expressions. /// `filters` should only contain expressions that can be evaluated @@ -424,7 +380,11 @@ pub async fn pruned_partition_list<'a>( file_extension: &'a str, partition_cols: &'a [(String, DataType)], ) -> Result>> { - // if no partition col => simply list all the files + let objects = table_path + .list_all_files(ctx, store, file_extension) + .await? + .try_filter(|object_meta| futures::future::ready(object_meta.size > 0)); + if partition_cols.is_empty() { if !filters.is_empty() { return internal_err!( @@ -432,72 +392,29 @@ pub async fn pruned_partition_list<'a>( table_path ); } - return Ok(Box::pin( - table_path - .list_all_files(ctx, store, file_extension) - .await? 
- .try_filter(|object_meta| futures::future::ready(object_meta.size > 0)) - .map_ok(|object_meta| object_meta.into()), - )); - } - - let partition_prefix = evaluate_partition_prefix(partition_cols, filters); - - let partitions = - list_partitions(store, table_path, partition_cols.len(), partition_prefix) - .await?; - debug!("Listed {} partitions", partitions.len()); - let pruned = - prune_partitions(table_path, partitions, filters, partition_cols).await?; - - debug!("Pruning yielded {} partitions", pruned.len()); - - let stream = futures::stream::iter(pruned) - .map(move |partition: Partition| async move { - let cols = partition_cols.iter().map(|x| x.0.as_str()); - let parsed = parse_partitions_for_path(table_path, &partition.path, cols); + // if no partition col => simply list all the files + Ok(objects.map_ok(|object_meta| object_meta.into()).boxed()) + } else { + let df_schema = DFSchema::from_unqualified_fields( + partition_cols + .iter() + .map(|(n, d)| Field::new(n, d.clone(), true)) + .collect(), + Default::default(), + )?; - let partition_values = parsed - .into_iter() - .flatten() - .zip(partition_cols) - .map(|(parsed, (_, datatype))| { - ScalarValue::try_from_string(parsed.to_string(), datatype) - }) - .collect::>>()?; - - let files = match partition.files { - Some(files) => files, - None => { - trace!("Recursively listing partition {}", partition.path); - store.list(Some(&partition.path)).try_collect().await? - } - }; - let files = files.into_iter().filter(move |o| { - let extension_match = o.location.as_ref().ends_with(file_extension); - // here need to scan subdirectories(`listing_table_ignore_subdirectory` = false) - let glob_match = table_path.contains(&o.location, false); - extension_match && glob_match - }); - - let stream = futures::stream::iter(files.map(move |object_meta| { - Ok(PartitionedFile { - object_meta, - partition_values: partition_values.clone(), - range: None, - statistics: None, - extensions: None, - metadata_size_hint: None, - }) - })); - - Ok::<_, DataFusionError>(stream) - }) - .buffer_unordered(CONCURRENCY_LIMIT) - .try_flatten() - .boxed(); - Ok(stream) + Ok(objects + .map_ok(|object_meta| { + try_into_partitioned_file(object_meta, partition_cols, table_path) + }) + .try_filter_map(move |pf| { + futures::future::ready( + pf.and_then(|pf| filter_partitions(pf, filters, &df_schema)), + ) + }) + .boxed()) + } } /// Extract the partition values for the given `file_path` (in the given `table_path`) @@ -541,22 +458,11 @@ pub fn describe_partition(partition: &Partition) -> (&str, usize, Vec<&str>) { #[cfg(test)] mod tests { - use async_trait::async_trait; - use datafusion_common::config::TableOptions; use datafusion_datasource::file_groups::FileGroup; - use datafusion_execution::config::SessionConfig; - use datafusion_execution::runtime_env::RuntimeEnv; - use futures::FutureExt; - use object_store::memory::InMemory; - use std::any::Any; use std::ops::Not; use super::*; - use datafusion_expr::{ - case, col, lit, AggregateUDF, Expr, LogicalPlan, ScalarUDF, WindowUDF, - }; - use datafusion_physical_expr_common::physical_expr::PhysicalExpr; - use datafusion_physical_plan::ExecutionPlan; + use datafusion_expr::{case, col, lit, Expr}; #[test] fn test_split_files() { @@ -599,209 +505,6 @@ mod tests { assert_eq!(0, chunks.len()); } - #[tokio::test] - async fn test_pruned_partition_list_empty() { - let (store, state) = make_test_store_and_state(&[ - ("tablepath/mypartition=val1/notparquetfile", 100), - ("tablepath/mypartition=val1/ignoresemptyfile.parquet", 0), - 
("tablepath/file.parquet", 100), - ("tablepath/notapartition/file.parquet", 100), - ("tablepath/notmypartition=val1/file.parquet", 100), - ]); - let filter = Expr::eq(col("mypartition"), lit("val1")); - let pruned = pruned_partition_list( - state.as_ref(), - store.as_ref(), - &ListingTableUrl::parse("file:///tablepath/").unwrap(), - &[filter], - ".parquet", - &[(String::from("mypartition"), DataType::Utf8)], - ) - .await - .expect("partition pruning failed") - .collect::>() - .await; - - assert_eq!(pruned.len(), 0); - } - - #[tokio::test] - async fn test_pruned_partition_list() { - let (store, state) = make_test_store_and_state(&[ - ("tablepath/mypartition=val1/file.parquet", 100), - ("tablepath/mypartition=val2/file.parquet", 100), - ("tablepath/mypartition=val1/ignoresemptyfile.parquet", 0), - ("tablepath/mypartition=val1/other=val3/file.parquet", 100), - ("tablepath/notapartition/file.parquet", 100), - ("tablepath/notmypartition=val1/file.parquet", 100), - ]); - let filter = Expr::eq(col("mypartition"), lit("val1")); - let pruned = pruned_partition_list( - state.as_ref(), - store.as_ref(), - &ListingTableUrl::parse("file:///tablepath/").unwrap(), - &[filter], - ".parquet", - &[(String::from("mypartition"), DataType::Utf8)], - ) - .await - .expect("partition pruning failed") - .try_collect::>() - .await - .unwrap(); - - assert_eq!(pruned.len(), 2); - let f1 = &pruned[0]; - assert_eq!( - f1.object_meta.location.as_ref(), - "tablepath/mypartition=val1/file.parquet" - ); - assert_eq!(&f1.partition_values, &[ScalarValue::from("val1")]); - let f2 = &pruned[1]; - assert_eq!( - f2.object_meta.location.as_ref(), - "tablepath/mypartition=val1/other=val3/file.parquet" - ); - assert_eq!(f2.partition_values, &[ScalarValue::from("val1"),]); - } - - #[tokio::test] - async fn test_pruned_partition_list_multi() { - let (store, state) = make_test_store_and_state(&[ - ("tablepath/part1=p1v1/file.parquet", 100), - ("tablepath/part1=p1v2/part2=p2v1/file1.parquet", 100), - ("tablepath/part1=p1v2/part2=p2v1/file2.parquet", 100), - ("tablepath/part1=p1v3/part2=p2v1/file2.parquet", 100), - ("tablepath/part1=p1v2/part2=p2v2/file2.parquet", 100), - ]); - let filter1 = Expr::eq(col("part1"), lit("p1v2")); - let filter2 = Expr::eq(col("part2"), lit("p2v1")); - let pruned = pruned_partition_list( - state.as_ref(), - store.as_ref(), - &ListingTableUrl::parse("file:///tablepath/").unwrap(), - &[filter1, filter2], - ".parquet", - &[ - (String::from("part1"), DataType::Utf8), - (String::from("part2"), DataType::Utf8), - ], - ) - .await - .expect("partition pruning failed") - .try_collect::>() - .await - .unwrap(); - - assert_eq!(pruned.len(), 2); - let f1 = &pruned[0]; - assert_eq!( - f1.object_meta.location.as_ref(), - "tablepath/part1=p1v2/part2=p2v1/file1.parquet" - ); - assert_eq!( - &f1.partition_values, - &[ScalarValue::from("p1v2"), ScalarValue::from("p2v1"),] - ); - let f2 = &pruned[1]; - assert_eq!( - f2.object_meta.location.as_ref(), - "tablepath/part1=p1v2/part2=p2v1/file2.parquet" - ); - assert_eq!( - &f2.partition_values, - &[ScalarValue::from("p1v2"), ScalarValue::from("p2v1")] - ); - } - - #[tokio::test] - async fn test_list_partition() { - let (store, _) = make_test_store_and_state(&[ - ("tablepath/part1=p1v1/file.parquet", 100), - ("tablepath/part1=p1v2/part2=p2v1/file1.parquet", 100), - ("tablepath/part1=p1v2/part2=p2v1/file2.parquet", 100), - ("tablepath/part1=p1v3/part2=p2v1/file3.parquet", 100), - ("tablepath/part1=p1v2/part2=p2v2/file4.parquet", 100), - 
("tablepath/part1=p1v2/part2=p2v2/empty.parquet", 0), - ]); - - let partitions = list_partitions( - store.as_ref(), - &ListingTableUrl::parse("file:///tablepath/").unwrap(), - 0, - None, - ) - .await - .expect("listing partitions failed"); - - assert_eq!( - &partitions - .iter() - .map(describe_partition) - .collect::>(), - &vec![ - ("tablepath", 0, vec![]), - ("tablepath/part1=p1v1", 1, vec![]), - ("tablepath/part1=p1v2", 1, vec![]), - ("tablepath/part1=p1v3", 1, vec![]), - ] - ); - - let partitions = list_partitions( - store.as_ref(), - &ListingTableUrl::parse("file:///tablepath/").unwrap(), - 1, - None, - ) - .await - .expect("listing partitions failed"); - - assert_eq!( - &partitions - .iter() - .map(describe_partition) - .collect::>(), - &vec![ - ("tablepath", 0, vec![]), - ("tablepath/part1=p1v1", 1, vec!["file.parquet"]), - ("tablepath/part1=p1v2", 1, vec![]), - ("tablepath/part1=p1v2/part2=p2v1", 2, vec![]), - ("tablepath/part1=p1v2/part2=p2v2", 2, vec![]), - ("tablepath/part1=p1v3", 1, vec![]), - ("tablepath/part1=p1v3/part2=p2v1", 2, vec![]), - ] - ); - - let partitions = list_partitions( - store.as_ref(), - &ListingTableUrl::parse("file:///tablepath/").unwrap(), - 2, - None, - ) - .await - .expect("listing partitions failed"); - - assert_eq!( - &partitions - .iter() - .map(describe_partition) - .collect::>(), - &vec![ - ("tablepath", 0, vec![]), - ("tablepath/part1=p1v1", 1, vec!["file.parquet"]), - ("tablepath/part1=p1v2", 1, vec![]), - ("tablepath/part1=p1v3", 1, vec![]), - ( - "tablepath/part1=p1v2/part2=p2v1", - 2, - vec!["file1.parquet", "file2.parquet"] - ), - ("tablepath/part1=p1v2/part2=p2v2", 2, vec!["file4.parquet"]), - ("tablepath/part1=p1v3/part2=p2v1", 2, vec!["file3.parquet"]), - ] - ); - } - #[test] fn test_parse_partitions_for_path() { assert_eq!( @@ -1016,86 +719,4 @@ mod tests { Some(Path::from("a=1970-01-05")), ); } - - pub fn make_test_store_and_state( - files: &[(&str, u64)], - ) -> (Arc, Arc) { - let memory = InMemory::new(); - - for (name, size) in files { - memory - .put(&Path::from(*name), vec![0; *size as usize].into()) - .now_or_never() - .unwrap() - .unwrap(); - } - - (Arc::new(memory), Arc::new(MockSession {})) - } - - struct MockSession {} - - #[async_trait] - impl Session for MockSession { - fn session_id(&self) -> &str { - unimplemented!() - } - - fn config(&self) -> &SessionConfig { - unimplemented!() - } - - async fn create_physical_plan( - &self, - _logical_plan: &LogicalPlan, - ) -> Result> { - unimplemented!() - } - - fn create_physical_expr( - &self, - _expr: Expr, - _df_schema: &DFSchema, - ) -> Result> { - unimplemented!() - } - - fn scalar_functions(&self) -> &std::collections::HashMap> { - unimplemented!() - } - - fn aggregate_functions( - &self, - ) -> &std::collections::HashMap> { - unimplemented!() - } - - fn window_functions(&self) -> &std::collections::HashMap> { - unimplemented!() - } - - fn runtime_env(&self) -> &Arc { - unimplemented!() - } - - fn execution_props(&self) -> &ExecutionProps { - unimplemented!() - } - - fn as_any(&self) -> &dyn Any { - unimplemented!() - } - - fn table_options(&self) -> &TableOptions { - unimplemented!() - } - - fn table_options_mut(&mut self) -> &mut TableOptions { - unimplemented!() - } - - fn task_ctx(&self) -> Arc { - unimplemented!() - } - } } diff --git a/datafusion/catalog-listing/src/mod.rs b/datafusion/catalog-listing/src/mod.rs index 90d04b46b806..1e06483261d2 100644 --- a/datafusion/catalog-listing/src/mod.rs +++ b/datafusion/catalog-listing/src/mod.rs @@ -15,6 +15,9 @@ // specific 
language governing permissions and limitations // under the License. +// https://github.com/apache/datafusion/issues/18503 +#![deny(clippy::needless_pass_by_value)] +#![cfg_attr(test, allow(clippy::needless_pass_by_value))] #![doc( html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" diff --git a/datafusion/catalog-listing/src/options.rs b/datafusion/catalog-listing/src/options.rs index 3cbf3573e951..7da8005f90ec 100644 --- a/datafusion/catalog-listing/src/options.rs +++ b/datafusion/catalog-listing/src/options.rs @@ -100,10 +100,8 @@ impl ListingOptions { /// # use datafusion_catalog_listing::ListingOptions; /// # use datafusion_datasource_parquet::file_format::ParquetFormat; /// - /// let listing_options = ListingOptions::new(Arc::new( - /// ParquetFormat::default() - /// )) - /// .with_file_extension(".parquet"); + /// let listing_options = ListingOptions::new(Arc::new(ParquetFormat::default())) + /// .with_file_extension(".parquet"); /// /// assert_eq!(listing_options.file_extension, ".parquet"); /// ``` @@ -123,10 +121,8 @@ impl ListingOptions { /// # use datafusion_datasource_parquet::file_format::ParquetFormat; /// /// let extension = Some(".parquet"); - /// let listing_options = ListingOptions::new(Arc::new( - /// ParquetFormat::default() - /// )) - /// .with_file_extension_opt(extension); + /// let listing_options = ListingOptions::new(Arc::new(ParquetFormat::default())) + /// .with_file_extension_opt(extension); /// /// assert_eq!(listing_options.file_extension, ".parquet"); /// ``` @@ -216,10 +212,8 @@ impl ListingOptions { /// # use datafusion_catalog_listing::ListingOptions; /// # use datafusion_datasource_parquet::file_format::ParquetFormat; /// - /// let listing_options = ListingOptions::new(Arc::new( - /// ParquetFormat::default() - /// )) - /// .with_collect_stat(true); + /// let listing_options = + /// ListingOptions::new(Arc::new(ParquetFormat::default())).with_collect_stat(true); /// /// assert_eq!(listing_options.collect_stat, true); /// ``` @@ -235,10 +229,8 @@ impl ListingOptions { /// # use datafusion_catalog_listing::ListingOptions; /// # use datafusion_datasource_parquet::file_format::ParquetFormat; /// - /// let listing_options = ListingOptions::new(Arc::new( - /// ParquetFormat::default() - /// )) - /// .with_target_partitions(8); + /// let listing_options = + /// ListingOptions::new(Arc::new(ParquetFormat::default())).with_target_partitions(8); /// /// assert_eq!(listing_options.target_partitions, 8); /// ``` @@ -255,15 +247,11 @@ impl ListingOptions { /// # use datafusion_catalog_listing::ListingOptions; /// # use datafusion_datasource_parquet::file_format::ParquetFormat; /// - /// // Tell datafusion that the files are sorted by column "a" - /// let file_sort_order = vec![vec![ - /// col("a").sort(true, true) - /// ]]; + /// // Tell datafusion that the files are sorted by column "a" + /// let file_sort_order = vec![vec![col("a").sort(true, true)]]; /// - /// let listing_options = ListingOptions::new(Arc::new( - /// ParquetFormat::default() - /// )) - /// .with_file_sort_order(file_sort_order.clone()); + /// let listing_options = ListingOptions::new(Arc::new(ParquetFormat::default())) + /// .with_file_sort_order(file_sort_order.clone()); /// /// assert_eq!(listing_options.file_sort_order, 
file_sort_order); /// ``` diff --git a/datafusion/catalog-listing/src/table.rs b/datafusion/catalog-listing/src/table.rs index e9ac1bf097a2..33d5c86bf88d 100644 --- a/datafusion/catalog-listing/src/table.rs +++ b/datafusion/catalog-listing/src/table.rs @@ -34,7 +34,7 @@ use datafusion_datasource::schema_adapter::{ DefaultSchemaAdapterFactory, SchemaAdapter, SchemaAdapterFactory, }; use datafusion_datasource::{ - compute_all_files_statistics, ListingTableUrl, PartitionedFile, + compute_all_files_statistics, ListingTableUrl, PartitionedFile, TableSchema, }; use datafusion_execution::cache::cache_manager::FileStatisticsCache; use datafusion_execution::cache::cache_unit::DefaultFileStatisticsCache; @@ -338,7 +338,16 @@ impl ListingTable { fn create_file_source_with_schema_adapter( &self, ) -> datafusion_common::Result> { - let mut source = self.options.format.file_source(); + let table_schema = TableSchema::new( + Arc::clone(&self.file_schema), + self.options + .table_partition_cols + .iter() + .map(|(col, field)| Arc::new(Field::new(col, field.clone(), false))) + .collect(), + ); + + let mut source = self.options.format.file_source(table_schema); // Apply schema adapter to source if available // // The source will use this SchemaAdapter to adapt data batches as they flow up the plan. @@ -418,7 +427,7 @@ impl TableProvider for ListingTable { .options .table_partition_cols .iter() - .map(|col| Ok(self.table_schema.field_with_name(&col.0)?.clone())) + .map(|col| Ok(Arc::new(self.table_schema.field_with_name(&col.0)?.clone()))) .collect::>>()?; let table_partition_col_names = table_partition_cols @@ -491,20 +500,15 @@ impl TableProvider for ListingTable { .format .create_physical_plan( state, - FileScanConfigBuilder::new( - object_store_url, - Arc::clone(&self.file_schema), - file_source, - ) - .with_file_groups(partitioned_file_lists) - .with_constraints(self.constraints.clone()) - .with_statistics(statistics) - .with_projection(projection) - .with_limit(limit) - .with_output_ordering(output_ordering) - .with_table_partition_cols(table_partition_cols) - .with_expr_adapter(self.expr_adapter_factory.clone()) - .build(), + FileScanConfigBuilder::new(object_store_url, file_source) + .with_file_groups(partitioned_file_lists) + .with_constraints(self.constraints.clone()) + .with_statistics(statistics) + .with_projection_indices(projection) + .with_limit(limit) + .with_output_ordering(output_ordering) + .with_expr_adapter(self.expr_adapter_factory.clone()) + .build(), ) .await?; diff --git a/datafusion/catalog/Cargo.toml b/datafusion/catalog/Cargo.toml index a1db45654be0..1009e9aee477 100644 --- a/datafusion/catalog/Cargo.toml +++ b/datafusion/catalog/Cargo.toml @@ -49,5 +49,8 @@ object_store = { workspace = true } parking_lot = { workspace = true } tokio = { workspace = true } +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/common-runtime/Cargo.toml b/datafusion/common-runtime/Cargo.toml index e53d97b41360..fd9a818bcb1d 100644 --- a/datafusion/common-runtime/Cargo.toml +++ b/datafusion/common-runtime/Cargo.toml @@ -31,6 +31,9 @@ rust-version = { workspace = true } [package.metadata.docs.rs] all-features = true +# Note: add additional linter rules in lib.rs. 
+# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/common-runtime/src/lib.rs b/datafusion/common-runtime/src/lib.rs index 5d404d99e776..eb32fead7ecf 100644 --- a/datafusion/common-runtime/src/lib.rs +++ b/datafusion/common-runtime/src/lib.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. +// https://github.com/apache/datafusion/issues/18503 +#![deny(clippy::needless_pass_by_value)] +#![cfg_attr(test, allow(clippy::needless_pass_by_value))] #![doc( html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index f5e51cb236d4..b222ae12b92f 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -31,6 +31,9 @@ rust-version = { workspace = true } [package.metadata.docs.rs] all-features = true +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true @@ -66,12 +69,12 @@ half = { workspace = true } hashbrown = { workspace = true } hex = { workspace = true, optional = true } indexmap = { workspace = true } -libc = "0.2.176" +libc = "0.2.177" log = { workspace = true } object_store = { workspace = true, optional = true } parquet = { workspace = true, optional = true, default-features = true } paste = "1.0.15" -pyo3 = { version = "0.25", optional = true } +pyo3 = { version = "0.26", optional = true } recursive = { workspace = true, optional = true } sqlparser = { workspace = true, optional = true } tokio = { workspace = true } diff --git a/datafusion/common/src/cast.rs b/datafusion/common/src/cast.rs index e6eda3c585e8..b95167ca1390 100644 --- a/datafusion/common/src/cast.rs +++ b/datafusion/common/src/cast.rs @@ -24,8 +24,8 @@ use crate::{downcast_value, Result}; use arrow::array::{ BinaryViewArray, Decimal32Array, Decimal64Array, DurationMicrosecondArray, DurationMillisecondArray, DurationNanosecondArray, DurationSecondArray, Float16Array, - Int16Array, Int8Array, LargeBinaryArray, LargeStringArray, StringViewArray, - UInt16Array, + Int16Array, Int8Array, LargeBinaryArray, LargeListViewArray, LargeStringArray, + ListViewArray, StringViewArray, UInt16Array, }; use arrow::{ array::{ @@ -324,3 +324,13 @@ pub fn as_generic_string_array( ) -> Result<&GenericStringArray> { Ok(downcast_value!(array, GenericStringArray, T)) } + +// Downcast Array to ListViewArray +pub fn as_list_view_array(array: &dyn Array) -> Result<&ListViewArray> { + Ok(downcast_value!(array, ListViewArray)) +} + +// Downcast Array to LargeListViewArray +pub fn as_large_list_view_array(array: &dyn Array) -> Result<&LargeListViewArray> { + Ok(downcast_value!(array, LargeListViewArray)) +} diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 271ba6ddcff5..0ed499da0475 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -26,14 +26,15 @@ use crate::format::{ExplainAnalyzeLevel, ExplainFormat}; use crate::parsers::CompressionTypeVariant; use crate::utils::get_available_parallelism; use crate::{DataFusionError, Result}; 
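The cast.rs hunk above adds `as_list_view_array` and `as_large_list_view_array` downcast helpers alongside the existing ones. A tiny sketch of the error path (the `Int32Array` input is illustrative, not from the diff):

```rust
// Sketch only: the new helpers return a DataFusion error instead of panicking
// when the input is not the expected array type.
use std::sync::Arc;

use arrow::array::{ArrayRef, Int32Array};
use datafusion_common::cast::{as_large_list_view_array, as_list_view_array};

fn main() {
    let ints: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));

    // An Int32Array is neither a ListViewArray nor a LargeListViewArray,
    // so both helpers report a downcast error rather than panicking.
    assert!(as_list_view_array(ints.as_ref()).is_err());
    assert!(as_large_list_view_array(ints.as_ref()).is_err());
}
```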
+#[cfg(feature = "parquet_encryption")] +use hex; use std::any::Any; use std::collections::{BTreeMap, HashMap}; use std::error::Error; use std::fmt::{self, Display}; use std::str::FromStr; - #[cfg(feature = "parquet_encryption")] -use hex; +use std::sync::Arc; /// A macro that wraps a configuration struct and automatically derives /// [`Default`] and [`ConfigField`] for it, allowing it to be used @@ -56,7 +57,7 @@ use hex; /// /// Field 3 doc /// field3: Option, default = None /// } -///} +/// } /// ``` /// /// Will generate @@ -466,9 +467,8 @@ config_namespace! { /// The default time zone /// - /// Some functions, e.g. `EXTRACT(HOUR from SOME_TIME)`, shift the underlying datetime - /// according to this time zone, and then extract the hour - pub time_zone: String, default = "+00:00".into() + /// Some functions, e.g. `now` return timestamps in this time zone + pub time_zone: Option, default = None /// Parquet options pub parquet: ParquetOptions, default = Default::default() @@ -517,6 +517,23 @@ config_namespace! { /// batches and merged. pub sort_in_place_threshold_bytes: usize, default = 1024 * 1024 + /// Maximum size in bytes for individual spill files before rotating to a new file. + /// + /// When operators spill data to disk (e.g., RepartitionExec), they write + /// multiple batches to the same file until this size limit is reached, then rotate + /// to a new file. This reduces syscall overhead compared to one-file-per-batch + /// while preventing files from growing too large. + /// + /// A larger value reduces file creation overhead but may hold more disk space. + /// A smaller value creates more files but allows finer-grained space reclamation + /// as files can be deleted once fully consumed. + /// + /// Now only `RepartitionExec` supports this spill file rotation feature, other spilling operators + /// may create spill files larger than the limit. + /// + /// Default: 128 MB + pub max_spill_file_size_bytes: usize, default = 128 * 1024 * 1024 + /// Number of files to read in parallel when inferring schema and statistics pub meta_fetch_concurrency: usize, default = 32 @@ -620,7 +637,10 @@ config_namespace! { /// bytes of the parquet file optimistically. If not specified, two reads are required: /// One read to fetch the 8-byte parquet footer and /// another to fetch the metadata length encoded in the footer - pub metadata_size_hint: Option, default = None + /// Default setting to 512 KiB, which should be sufficient for most parquet files, + /// it can reduce one I/O operation per parquet file. If the metadata is larger than + /// the hint, two reads will still be performed. + pub metadata_size_hint: Option, default = Some(512 * 1024) /// (reading) If true, filter expressions are be applied during the parquet decoding operation to /// reduce the number of rows decoded. This optimization is sometimes called "late materialization". @@ -1322,36 +1342,35 @@ impl ConfigOptions { /// # Example /// ``` /// use datafusion_common::{ -/// config::ConfigExtension, extensions_options, -/// config::ConfigOptions, +/// config::ConfigExtension, config::ConfigOptions, extensions_options, /// }; -/// // Define a new configuration struct using the `extensions_options` macro -/// extensions_options! { -/// /// My own config options. -/// pub struct MyConfig { -/// /// Should "foo" be replaced by "bar"? -/// pub foo_to_bar: bool, default = true +/// // Define a new configuration struct using the `extensions_options` macro +/// extensions_options! { +/// /// My own config options. 
+/// pub struct MyConfig { +/// /// Should "foo" be replaced by "bar"? +/// pub foo_to_bar: bool, default = true /// -/// /// How many "baz" should be created? -/// pub baz_count: usize, default = 1337 -/// } -/// } +/// /// How many "baz" should be created? +/// pub baz_count: usize, default = 1337 +/// } +/// } /// -/// impl ConfigExtension for MyConfig { +/// impl ConfigExtension for MyConfig { /// const PREFIX: &'static str = "my_config"; -/// } +/// } /// -/// // set up config struct and register extension -/// let mut config = ConfigOptions::default(); -/// config.extensions.insert(MyConfig::default()); +/// // set up config struct and register extension +/// let mut config = ConfigOptions::default(); +/// config.extensions.insert(MyConfig::default()); /// -/// // overwrite config default -/// config.set("my_config.baz_count", "42").unwrap(); +/// // overwrite config default +/// config.set("my_config.baz_count", "42").unwrap(); /// -/// // check config state -/// let my_config = config.extensions.get::().unwrap(); -/// assert!(my_config.foo_to_bar,); -/// assert_eq!(my_config.baz_count, 42,); +/// // check config state +/// let my_config = config.extensions.get::().unwrap(); +/// assert!(my_config.foo_to_bar,); +/// assert_eq!(my_config.baz_count, 42,); /// ``` /// /// # Note: @@ -2409,13 +2428,13 @@ impl From for FileEncryptionProperties { hex::decode(&val.aad_prefix_as_hex).expect("Invalid AAD prefix"); fep = fep.with_aad_prefix(aad_prefix); } - fep.build().unwrap() + Arc::unwrap_or_clone(fep.build().unwrap()) } } #[cfg(feature = "parquet_encryption")] -impl From<&FileEncryptionProperties> for ConfigFileEncryptionProperties { - fn from(f: &FileEncryptionProperties) -> Self { +impl From<&Arc> for ConfigFileEncryptionProperties { + fn from(f: &Arc) -> Self { let (column_names_vec, column_keys_vec, column_metas_vec) = f.column_keys(); let mut column_encryption_properties: HashMap< @@ -2557,13 +2576,13 @@ impl From for FileDecryptionProperties { fep = fep.with_aad_prefix(aad_prefix); } - fep.build().unwrap() + Arc::unwrap_or_clone(fep.build().unwrap()) } } #[cfg(feature = "parquet_encryption")] -impl From<&FileDecryptionProperties> for ConfigFileDecryptionProperties { - fn from(f: &FileDecryptionProperties) -> Self { +impl From<&Arc> for ConfigFileDecryptionProperties { + fn from(f: &Arc) -> Self { let (column_names_vec, column_keys_vec) = f.column_keys(); let mut column_decryption_properties: HashMap< String, @@ -2834,6 +2853,7 @@ mod tests { }; use std::any::Any; use std::collections::HashMap; + use std::sync::Arc; #[derive(Default, Debug, Clone)] pub struct TestExtensionConfig { @@ -2990,16 +3010,15 @@ mod tests { .unwrap(); // Test round-trip - let config_encrypt: ConfigFileEncryptionProperties = - (&file_encryption_properties).into(); - let encryption_properties_built: FileEncryptionProperties = - config_encrypt.clone().into(); + let config_encrypt = + ConfigFileEncryptionProperties::from(&file_encryption_properties); + let encryption_properties_built = + Arc::new(FileEncryptionProperties::from(config_encrypt.clone())); assert_eq!(file_encryption_properties, encryption_properties_built); - let config_decrypt: ConfigFileDecryptionProperties = - (&decryption_properties).into(); - let decryption_properties_built: FileDecryptionProperties = - config_decrypt.clone().into(); + let config_decrypt = ConfigFileDecryptionProperties::from(&decryption_properties); + let decryption_properties_built = + Arc::new(FileDecryptionProperties::from(config_decrypt.clone())); 
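For the new `max_spill_file_size_bytes` execution option documented earlier in this config.rs hunk, a minimal sketch of reading and overriding it through `ConfigOptions` (direct field access is used here to avoid assuming a particular string key):

```rust
// Sketch only: exercises the new spill-file rotation limit via direct field access.
use datafusion_common::config::ConfigOptions;

fn main() {
    let mut config = ConfigOptions::default();
    // The documented default is 128 MB
    assert_eq!(config.execution.max_spill_file_size_bytes, 128 * 1024 * 1024);

    // Lower the per-file spill limit to 64 MB; per the docs above, RepartitionExec
    // rotates to a new spill file once this many bytes have been written.
    config.execution.max_spill_file_size_bytes = 64 * 1024 * 1024;
}
```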
assert_eq!(decryption_properties, decryption_properties_built); /////////////////////////////////////////////////////////////////////////////////// diff --git a/datafusion/common/src/datatype.rs b/datafusion/common/src/datatype.rs index 544ec0c2468c..65f639521186 100644 --- a/datafusion/common/src/datatype.rs +++ b/datafusion/common/src/datatype.rs @@ -81,7 +81,6 @@ pub trait FieldExt { /// assert_eq!(list_field.data_type(), &DataType::List(Arc::new( /// Field::new("item", DataType::Int32, true) /// ))); - /// fn into_list(self) -> Self; /// Return a new Field representing this Field as the item type of a @@ -107,7 +106,6 @@ pub trait FieldExt { /// Field::new("item", DataType::Int32, true)), /// 3 /// )); - /// fn into_fixed_size_list(self, list_size: i32) -> Self; /// Update the field to have the default list field name ("item") diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 6866b4011f9e..24d152a7dba8 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -56,12 +56,10 @@ pub type DFSchemaRef = Arc; /// an Arrow schema. /// /// ```rust -/// use datafusion_common::{DFSchema, Column}; /// use arrow::datatypes::{DataType, Field, Schema}; +/// use datafusion_common::{Column, DFSchema}; /// -/// let arrow_schema = Schema::new(vec![ -/// Field::new("c1", DataType::Int32, false), -/// ]); +/// let arrow_schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); /// /// let df_schema = DFSchema::try_from_qualified_schema("t1", &arrow_schema).unwrap(); /// let column = Column::from_qualified_name("t1.c1"); @@ -77,12 +75,10 @@ pub type DFSchemaRef = Arc; /// Create an unqualified schema using TryFrom: /// /// ```rust -/// use datafusion_common::{DFSchema, Column}; /// use arrow::datatypes::{DataType, Field, Schema}; +/// use datafusion_common::{Column, DFSchema}; /// -/// let arrow_schema = Schema::new(vec![ -/// Field::new("c1", DataType::Int32, false), -/// ]); +/// let arrow_schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); /// /// let df_schema = DFSchema::try_from(arrow_schema).unwrap(); /// let column = Column::new_unqualified("c1"); @@ -94,13 +90,15 @@ pub type DFSchemaRef = Arc; /// Use the `Into` trait to convert `DFSchema` into an Arrow schema: /// /// ```rust +/// use arrow::datatypes::{Field, Schema}; /// use datafusion_common::DFSchema; -/// use arrow::datatypes::{Schema, Field}; /// use std::collections::HashMap; /// -/// let df_schema = DFSchema::from_unqualified_fields(vec![ -/// Field::new("c1", arrow::datatypes::DataType::Int32, false), -/// ].into(),HashMap::new()).unwrap(); +/// let df_schema = DFSchema::from_unqualified_fields( +/// vec![Field::new("c1", arrow::datatypes::DataType::Int32, false)].into(), +/// HashMap::new(), +/// ) +/// .unwrap(); /// let schema: &Schema = df_schema.as_arrow(); /// assert_eq!(schema.fields().len(), 1); /// ``` @@ -884,22 +882,26 @@ impl DFSchema { /// # Example /// /// ``` - /// use datafusion_common::DFSchema; /// use arrow::datatypes::{DataType, Field, Schema}; + /// use datafusion_common::DFSchema; /// use std::collections::HashMap; /// /// let schema = DFSchema::from_unqualified_fields( /// vec![ /// Field::new("id", DataType::Int32, false), /// Field::new("name", DataType::Utf8, true), - /// ].into(), - /// HashMap::new() - /// ).unwrap(); + /// ] + /// .into(), + /// HashMap::new(), + /// ) + /// .unwrap(); /// - /// assert_eq!(schema.tree_string().to_string(), - /// r#"root + /// assert_eq!( + /// schema.tree_string().to_string(), 
+ /// r#"root /// |-- id: int32 (nullable = false) - /// |-- name: utf8 (nullable = true)"#); + /// |-- name: utf8 (nullable = true)"# + /// ); /// ``` pub fn tree_string(&self) -> impl Display + '_ { let mut result = String::from("root\n"); @@ -1417,7 +1419,7 @@ mod tests { fn from_qualified_schema_into_arrow_schema() -> Result<()> { let schema = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?; let arrow_schema = schema.as_arrow(); - insta::assert_snapshot!(arrow_schema, @r#"Field { name: "c0", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "c1", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }"#); + insta::assert_snapshot!(arrow_schema.to_string(), @r#"Field { "c0": nullable Boolean }, Field { "c1": nullable Boolean }"#); Ok(()) } diff --git a/datafusion/common/src/diagnostic.rs b/datafusion/common/src/diagnostic.rs index 0dce8e6a56ec..b25bf1c12e44 100644 --- a/datafusion/common/src/diagnostic.rs +++ b/datafusion/common/src/diagnostic.rs @@ -30,8 +30,11 @@ use crate::Span; /// ```rust /// # use datafusion_common::{Location, Span, Diagnostic}; /// let span = Some(Span { -/// start: Location{ line: 2, column: 1 }, -/// end: Location{ line: 4, column: 15 } +/// start: Location { line: 2, column: 1 }, +/// end: Location { +/// line: 4, +/// column: 15, +/// }, /// }); /// let diagnostic = Diagnostic::new_error("Something went wrong", span) /// .with_help("Have you tried turning it on and off again?", None); diff --git a/datafusion/common/src/encryption.rs b/datafusion/common/src/encryption.rs index b764ad77cff1..2a8cfdbc8996 100644 --- a/datafusion/common/src/encryption.rs +++ b/datafusion/common/src/encryption.rs @@ -24,38 +24,10 @@ pub use parquet::encryption::decrypt::FileDecryptionProperties; pub use parquet::encryption::encrypt::FileEncryptionProperties; #[cfg(not(feature = "parquet_encryption"))] -#[derive(Default, Debug)] +#[derive(Default, Clone, Debug)] pub struct FileDecryptionProperties; #[cfg(not(feature = "parquet_encryption"))] -#[derive(Default, Debug)] +#[derive(Default, Clone, Debug)] pub struct FileEncryptionProperties; pub use crate::config::{ConfigFileDecryptionProperties, ConfigFileEncryptionProperties}; - -#[cfg(feature = "parquet_encryption")] -pub fn map_encryption_to_config_encryption( - encryption: Option<&FileEncryptionProperties>, -) -> Option { - encryption.map(|fe| fe.into()) -} - -#[cfg(not(feature = "parquet_encryption"))] -pub fn map_encryption_to_config_encryption( - _encryption: Option<&FileEncryptionProperties>, -) -> Option { - None -} - -#[cfg(feature = "parquet_encryption")] -pub fn map_config_decryption_to_decryption( - decryption: &ConfigFileDecryptionProperties, -) -> FileDecryptionProperties { - decryption.clone().into() -} - -#[cfg(not(feature = "parquet_encryption"))] -pub fn map_config_decryption_to_decryption( - _decryption: &ConfigFileDecryptionProperties, -) -> FileDecryptionProperties { - FileDecryptionProperties {} -} diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index 210f0442972d..4fa6d28e7324 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -684,7 +684,10 @@ impl DataFusionError { /// let mut builder = DataFusionError::builder(); /// builder.add_error(DataFusionError::Internal("foo".to_owned())); /// // ok_or returns the value if no errors have been added -/// assert_contains!(builder.error_or(42).unwrap_err().to_string(), "Internal error: foo"); +/// assert_contains!( 
+/// builder.error_or(42).unwrap_err().to_string(), +/// "Internal error: foo" +/// ); /// ``` #[derive(Debug, Default)] pub struct DataFusionErrorBuilder(Vec); @@ -702,7 +705,10 @@ impl DataFusionErrorBuilder { /// # use datafusion_common::{assert_contains, DataFusionError}; /// let mut builder = DataFusionError::builder(); /// builder.add_error(DataFusionError::Internal("foo".to_owned())); - /// assert_contains!(builder.error_or(42).unwrap_err().to_string(), "Internal error: foo"); + /// assert_contains!( + /// builder.error_or(42).unwrap_err().to_string(), + /// "Internal error: foo" + /// ); /// ``` pub fn add_error(&mut self, error: DataFusionError) { self.0.push(error); @@ -714,8 +720,11 @@ impl DataFusionErrorBuilder { /// ``` /// # use datafusion_common::{assert_contains, DataFusionError}; /// let builder = DataFusionError::builder() - /// .with_error(DataFusionError::Internal("foo".to_owned())); - /// assert_contains!(builder.error_or(42).unwrap_err().to_string(), "Internal error: foo"); + /// .with_error(DataFusionError::Internal("foo".to_owned())); + /// assert_contains!( + /// builder.error_or(42).unwrap_err().to_string(), + /// "Internal error: foo" + /// ); /// ``` pub fn with_error(mut self, error: DataFusionError) -> Self { self.0.push(error); @@ -749,6 +758,116 @@ macro_rules! unwrap_or_internal_err { }; } +/// Assert a condition, returning `DataFusionError::Internal` on failure. +/// +/// # Examples +/// +/// ```text +/// assert_or_internal_err!(predicate); +/// assert_or_internal_err!(predicate, "human readable message"); +/// assert_or_internal_err!(predicate, format!("details: {}", value)); +/// ``` +#[macro_export] +macro_rules! assert_or_internal_err { + ($cond:expr) => { + if !$cond { + return Err(DataFusionError::Internal(format!( + "Assertion failed: {}", + stringify!($cond) + ))); + } + }; + ($cond:expr, $($arg:tt)+) => { + if !$cond { + return Err(DataFusionError::Internal(format!( + "Assertion failed: {}: {}", + stringify!($cond), + format!($($arg)+) + ))); + } + }; +} + +/// Assert equality, returning `DataFusionError::Internal` on failure. +/// +/// # Examples +/// +/// ```text +/// assert_eq_or_internal_err!(actual, expected); +/// assert_eq_or_internal_err!(left_expr, right_expr, "values must match"); +/// assert_eq_or_internal_err!(lhs, rhs, "metadata: {}", extra); +/// ``` +#[macro_export] +macro_rules! assert_eq_or_internal_err { + ($left:expr, $right:expr $(,)?) => {{ + let left_val = &$left; + let right_val = &$right; + if left_val != right_val { + return Err(DataFusionError::Internal(format!( + "Assertion failed: {} == {} (left: {:?}, right: {:?})", + stringify!($left), + stringify!($right), + left_val, + right_val + ))); + } + }}; + ($left:expr, $right:expr, $($arg:tt)+) => {{ + let left_val = &$left; + let right_val = &$right; + if left_val != right_val { + return Err(DataFusionError::Internal(format!( + "Assertion failed: {} == {} (left: {:?}, right: {:?}): {}", + stringify!($left), + stringify!($right), + left_val, + right_val, + format!($($arg)+) + ))); + } + }}; +} + +/// Assert inequality, returning `DataFusionError::Internal` on failure. +/// +/// # Examples +/// +/// ```text +/// assert_ne_or_internal_err!(left, right); +/// assert_ne_or_internal_err!(lhs_expr, rhs_expr, "values must differ"); +/// assert_ne_or_internal_err!(a, b, "context {}", info); +/// ``` +#[macro_export] +macro_rules! assert_ne_or_internal_err { + ($left:expr, $right:expr $(,)?) 
=> {{ + let left_val = &$left; + let right_val = &$right; + if left_val == right_val { + return Err(DataFusionError::Internal(format!( + "Assertion failed: {} != {} (left: {:?}, right: {:?})", + stringify!($left), + stringify!($right), + left_val, + right_val + ))); + } + }}; + ($left:expr, $right:expr, $($arg:tt)+) => {{ + let left_val = &$left; + let right_val = &$right; + if left_val == right_val { + return Err(DataFusionError::Internal(format!( + "Assertion failed: {} != {} (left: {:?}, right: {:?}): {}", + stringify!($left), + stringify!($right), + left_val, + right_val, + format!($($arg)+) + ))); + } + }}; +} + /// Add a macros for concise DataFusionError::* errors declaration /// supports placeholders the same way as `format!` /// Examples: @@ -965,6 +1084,115 @@ mod test { use std::sync::Arc; use arrow::error::ArrowError; + use insta::assert_snapshot; + + fn ok_result() -> Result<()> { + Ok(()) + } + + #[test] + fn test_assert_eq_or_internal_err_passes() -> Result<()> { + assert_eq_or_internal_err!(1, 1); + ok_result() + } + + #[test] + fn test_assert_eq_or_internal_err_fails() { + fn check() -> Result<()> { + assert_eq_or_internal_err!(1, 2, "expected equality"); + ok_result() + } + + let err = check().unwrap_err(); + assert_snapshot!( + err.to_string(), + @r" + Internal error: Assertion failed: 1 == 2 (left: 1, right: 2): expected equality. + This issue was likely caused by a bug in DataFusion's code. Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues + " + ); + } + + #[test] + fn test_assert_ne_or_internal_err_passes() -> Result<()> { + assert_ne_or_internal_err!(1, 2); + ok_result() + } + + #[test] + fn test_assert_ne_or_internal_err_fails() { + fn check() -> Result<()> { + assert_ne_or_internal_err!(3, 3, "values must differ"); + ok_result() + } + + let err = check().unwrap_err(); + assert_snapshot!( + err.to_string(), + @r" + Internal error: Assertion failed: 3 != 3 (left: 3, right: 3): values must differ. + This issue was likely caused by a bug in DataFusion's code. Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues + " + ); + } + + #[test] + fn test_assert_or_internal_err_passes() -> Result<()> { + assert_or_internal_err!(true); + assert_or_internal_err!(true, "message"); + ok_result() + } + + #[test] + fn test_assert_or_internal_err_fails_default() { + fn check() -> Result<()> { + assert_or_internal_err!(false); + ok_result() + } + + let err = check().unwrap_err(); + assert_snapshot!( + err.to_string(), + @r" + Internal error: Assertion failed: false. + This issue was likely caused by a bug in DataFusion's code. Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues + " + ); + } + + #[test] + fn test_assert_or_internal_err_fails_with_message() { + fn check() -> Result<()> { + assert_or_internal_err!(false, "custom message"); + ok_result() + } + + let err = check().unwrap_err(); + assert_snapshot!( + err.to_string(), + @r" + Internal error: Assertion failed: false: custom message. + This issue was likely caused by a bug in DataFusion's code. 
Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues + " + ); + } + + #[test] + fn test_assert_or_internal_err_with_format_arguments() { + fn check() -> Result<()> { + assert_or_internal_err!(false, "custom {}", 42); + ok_result() + } + + let err = check().unwrap_err(); + assert_snapshot!( + err.to_string(), + @r" + Internal error: Assertion failed: false: custom 42. + This issue was likely caused by a bug in DataFusion's code. Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues + " + ); + } #[test] fn test_error_size() { diff --git a/datafusion/common/src/file_options/parquet_writer.rs b/datafusion/common/src/file_options/parquet_writer.rs index 3977f2b489e1..564929c61bab 100644 --- a/datafusion/common/src/file_options/parquet_writer.rs +++ b/datafusion/common/src/file_options/parquet_writer.rs @@ -402,15 +402,14 @@ pub(crate) fn parse_statistics_string(str_setting: &str) -> Result( /// with the new hash using `combine_hashes` #[cfg(not(feature = "force_hash_collisions"))] fn hash_array( - array: T, + array: &T, random_state: &RandomState, hashes_buffer: &mut [u64], rehash: bool, @@ -215,12 +212,11 @@ fn hash_dictionary( // Hash each dictionary value once, and then use that computed // hash for each key value to avoid a potentially expensive // redundant hashing for large dictionary elements (e.g. strings) - let dict_values = Arc::clone(array.values()); + let dict_values = array.values(); let mut dict_hashes = vec![0; dict_values.len()]; - create_hashes(&[dict_values], random_state, &mut dict_hashes)?; + create_hashes([dict_values], random_state, &mut dict_hashes)?; // combine hash for each index in values - let dict_values = array.values(); for (hash, key) in hashes_buffer.iter_mut().zip(array.keys().iter()) { if let Some(key) = key { let idx = key.as_usize(); @@ -308,11 +304,11 @@ fn hash_list_array( where OffsetSize: OffsetSizeTrait, { - let values = Arc::clone(array.values()); + let values = array.values(); let offsets = array.value_offsets(); let nulls = array.nulls(); let mut values_hashes = vec![0u64; values.len()]; - create_hashes(&[values], random_state, &mut values_hashes)?; + create_hashes([values], random_state, &mut values_hashes)?; if let Some(nulls) = nulls { for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() { if nulls.is_valid(i) { @@ -339,11 +335,11 @@ fn hash_fixed_list_array( random_state: &RandomState, hashes_buffer: &mut [u64], ) -> Result<()> { - let values = Arc::clone(array.values()); + let values = array.values(); let value_length = array.value_length() as usize; let nulls = array.nulls(); let mut values_hashes = vec![0u64; values.len()]; - create_hashes(&[values], random_state, &mut values_hashes)?; + create_hashes([values], random_state, &mut values_hashes)?; if let Some(nulls) = nulls { for i in 0..array.len() { if nulls.is_valid(i) { @@ -366,83 +362,132 @@ fn hash_fixed_list_array( Ok(()) } -/// Test version of `create_hashes` that produces the same value for -/// all hashes (to test collisions) -/// -/// See comments on `hashes_buffer` for more details +/// Internal helper function that hashes a single array and either initializes or combines +/// the hash values in the buffer. +#[cfg(not(feature = "force_hash_collisions"))] +fn hash_single_array( + array: &dyn Array, + random_state: &RandomState, + hashes_buffer: &mut [u64], + rehash: bool, +) -> Result<()> { + downcast_primitive_array! 
{ + array => hash_array_primitive(array, random_state, hashes_buffer, rehash), + DataType::Null => hash_null(random_state, hashes_buffer, rehash), + DataType::Boolean => hash_array(&as_boolean_array(array)?, random_state, hashes_buffer, rehash), + DataType::Utf8 => hash_array(&as_string_array(array)?, random_state, hashes_buffer, rehash), + DataType::Utf8View => hash_array(&as_string_view_array(array)?, random_state, hashes_buffer, rehash), + DataType::LargeUtf8 => hash_array(&as_largestring_array(array), random_state, hashes_buffer, rehash), + DataType::Binary => hash_array(&as_generic_binary_array::(array)?, random_state, hashes_buffer, rehash), + DataType::BinaryView => hash_array(&as_binary_view_array(array)?, random_state, hashes_buffer, rehash), + DataType::LargeBinary => hash_array(&as_generic_binary_array::(array)?, random_state, hashes_buffer, rehash), + DataType::FixedSizeBinary(_) => { + let array: &FixedSizeBinaryArray = array.as_any().downcast_ref().unwrap(); + hash_array(&array, random_state, hashes_buffer, rehash) + } + DataType::Dictionary(_, _) => downcast_dictionary_array! { + array => hash_dictionary(array, random_state, hashes_buffer, rehash)?, + _ => unreachable!() + } + DataType::Struct(_) => { + let array = as_struct_array(array)?; + hash_struct_array(array, random_state, hashes_buffer)?; + } + DataType::List(_) => { + let array = as_list_array(array)?; + hash_list_array(array, random_state, hashes_buffer)?; + } + DataType::LargeList(_) => { + let array = as_large_list_array(array)?; + hash_list_array(array, random_state, hashes_buffer)?; + } + DataType::Map(_, _) => { + let array = as_map_array(array)?; + hash_map_array(array, random_state, hashes_buffer)?; + } + DataType::FixedSizeList(_,_) => { + let array = as_fixed_size_list_array(array)?; + hash_fixed_list_array(array, random_state, hashes_buffer)?; + } + _ => { + // This is internal because we should have caught this before. + return _internal_err!( + "Unsupported data type in hasher: {}", + array.data_type() + ); + } + } + Ok(()) +} + +/// Test version of `hash_single_array` that forces all hashes to collide to zero. #[cfg(feature = "force_hash_collisions")] -pub fn create_hashes<'a>( - _arrays: &[ArrayRef], +fn hash_single_array( + _array: &dyn Array, _random_state: &RandomState, - hashes_buffer: &'a mut Vec, -) -> Result<&'a mut Vec> { + hashes_buffer: &mut [u64], + _rehash: bool, +) -> Result<()> { for hash in hashes_buffer.iter_mut() { *hash = 0 } - Ok(hashes_buffer) + Ok(()) +} + +/// Something that can be returned as a `&dyn Array`. +/// +/// We want `create_hashes` to accept either `&dyn Array` or `ArrayRef`, +/// and this seems the best way to do so. +/// +/// We tried having it accept `AsRef` +/// but that is not implemented for and cannot be implemented for +/// `&dyn Array` so callers that have the latter would not be able +/// to call `create_hashes` directly. This shim trait makes it possible. +pub trait AsDynArray { + fn as_dyn_array(&self) -> &dyn Array; +} + +impl AsDynArray for dyn Array { + fn as_dyn_array(&self) -> &dyn Array { + self + } +} + +impl AsDynArray for &dyn Array { + fn as_dyn_array(&self) -> &dyn Array { + *self + } +} + +impl AsDynArray for ArrayRef { + fn as_dyn_array(&self) -> &dyn Array { + self.as_ref() + } } -/// Creates hash values for every row, based on the values in the -/// columns. +impl AsDynArray for &ArrayRef { + fn as_dyn_array(&self) -> &dyn Array { + self.as_ref() + } +} + +/// Creates hash values for every row, based on the values in the columns. 
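// A minimal standalone sketch (not part of this patch) of what the generic
// `create_hashes` introduced here accepts: any `IntoIterator` whose items
// implement the `AsDynArray` shim above, so both the old `&[ArrayRef]` calling
// convention and plain `&dyn Array` values compile. It assumes the function
// remains re-exported as `datafusion_common::hash_utils::create_hashes` and
// still takes an `ahash::RandomState`, as in the surrounding code.
use std::sync::Arc;

use ahash::RandomState;
use arrow::array::{Array, ArrayRef, Int32Array, StringArray};
use datafusion_common::hash_utils::create_hashes;

fn hash_two_columns() -> datafusion_common::Result<()> {
    let ints: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
    let strings: ArrayRef = Arc::new(StringArray::from(vec!["a", "b", "c"]));
    let random_state = RandomState::with_seeds(0, 0, 0, 0);

    // Old style: a slice of `ArrayRef` works exactly as before.
    let mut hashes = vec![0u64; ints.len()];
    create_hashes(
        &[Arc::clone(&ints), Arc::clone(&strings)],
        &random_state,
        &mut hashes,
    )?;

    // New style: borrowed `&dyn Array` values, no `Arc` cloning needed.
    let columns: [&dyn Array; 2] = [ints.as_ref(), strings.as_ref()];
    let mut hashes_dyn = vec![0u64; ints.len()];
    create_hashes(columns, &random_state, &mut hashes_dyn)?;

    assert_eq!(hashes, hashes_dyn);
    Ok(())
}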
/// /// The number of rows to hash is determined by `hashes_buffer.len()`. -/// `hashes_buffer` should be pre-sized appropriately -#[cfg(not(feature = "force_hash_collisions"))] -pub fn create_hashes<'a>( - arrays: &[ArrayRef], +/// `hashes_buffer` should be pre-sized appropriately. +pub fn create_hashes<'a, I, T>( + arrays: I, random_state: &RandomState, hashes_buffer: &'a mut Vec, -) -> Result<&'a mut Vec> { - for (i, col) in arrays.iter().enumerate() { - let array = col.as_ref(); +) -> Result<&'a mut Vec> +where + I: IntoIterator, + T: AsDynArray, +{ + for (i, array) in arrays.into_iter().enumerate() { // combine hashes with `combine_hashes` for all columns besides the first let rehash = i >= 1; - downcast_primitive_array! { - array => hash_array_primitive(array, random_state, hashes_buffer, rehash), - DataType::Null => hash_null(random_state, hashes_buffer, rehash), - DataType::Boolean => hash_array(as_boolean_array(array)?, random_state, hashes_buffer, rehash), - DataType::Utf8 => hash_array(as_string_array(array)?, random_state, hashes_buffer, rehash), - DataType::Utf8View => hash_array(as_string_view_array(array)?, random_state, hashes_buffer, rehash), - DataType::LargeUtf8 => hash_array(as_largestring_array(array), random_state, hashes_buffer, rehash), - DataType::Binary => hash_array(as_generic_binary_array::(array)?, random_state, hashes_buffer, rehash), - DataType::BinaryView => hash_array(as_binary_view_array(array)?, random_state, hashes_buffer, rehash), - DataType::LargeBinary => hash_array(as_generic_binary_array::(array)?, random_state, hashes_buffer, rehash), - DataType::FixedSizeBinary(_) => { - let array: &FixedSizeBinaryArray = array.as_any().downcast_ref().unwrap(); - hash_array(array, random_state, hashes_buffer, rehash) - } - DataType::Dictionary(_, _) => downcast_dictionary_array! { - array => hash_dictionary(array, random_state, hashes_buffer, rehash)?, - _ => unreachable!() - } - DataType::Struct(_) => { - let array = as_struct_array(array)?; - hash_struct_array(array, random_state, hashes_buffer)?; - } - DataType::List(_) => { - let array = as_list_array(array)?; - hash_list_array(array, random_state, hashes_buffer)?; - } - DataType::LargeList(_) => { - let array = as_large_list_array(array)?; - hash_list_array(array, random_state, hashes_buffer)?; - } - DataType::Map(_, _) => { - let array = as_map_array(array)?; - hash_map_array(array, random_state, hashes_buffer)?; - } - DataType::FixedSizeList(_,_) => { - let array = as_fixed_size_list_array(array)?; - hash_fixed_list_array(array, random_state, hashes_buffer)?; - } - _ => { - // This is internal because we should have caught this before. 
- return _internal_err!( - "Unsupported data type in hasher: {}", - col.data_type() - ); - } - } + hash_single_array(array.as_dyn_array(), random_state, hashes_buffer, rehash)?; } Ok(hashes_buffer) } @@ -465,7 +510,7 @@ mod tests { .collect::() .with_precision_and_scale(20, 3) .unwrap(); - let array_ref = Arc::new(array); + let array_ref: ArrayRef = Arc::new(array); let random_state = RandomState::with_seeds(0, 0, 0, 0); let hashes_buff = &mut vec![0; array_ref.len()]; let hashes = create_hashes(&[array_ref], &random_state, hashes_buff)?; @@ -478,15 +523,21 @@ mod tests { let empty_array = FixedSizeListBuilder::new(StringBuilder::new(), 1).finish(); let random_state = RandomState::with_seeds(0, 0, 0, 0); let hashes_buff = &mut vec![0; 0]; - let hashes = create_hashes(&[Arc::new(empty_array)], &random_state, hashes_buff)?; + let hashes = create_hashes( + &[Arc::new(empty_array) as ArrayRef], + &random_state, + hashes_buff, + )?; assert_eq!(hashes, &Vec::::new()); Ok(()) } #[test] fn create_hashes_for_float_arrays() -> Result<()> { - let f32_arr = Arc::new(Float32Array::from(vec![0.12, 0.5, 1f32, 444.7])); - let f64_arr = Arc::new(Float64Array::from(vec![0.12, 0.5, 1f64, 444.7])); + let f32_arr: ArrayRef = + Arc::new(Float32Array::from(vec![0.12, 0.5, 1f32, 444.7])); + let f64_arr: ArrayRef = + Arc::new(Float64Array::from(vec![0.12, 0.5, 1f64, 444.7])); let random_state = RandomState::with_seeds(0, 0, 0, 0); let hashes_buff = &mut vec![0; f32_arr.len()]; @@ -514,8 +565,10 @@ mod tests { Some(b"Longer than 12 bytes string"), ]; - let binary_array = Arc::new(binary.iter().cloned().collect::<$ARRAY>()); - let ref_array = Arc::new(binary.iter().cloned().collect::()); + let binary_array: ArrayRef = + Arc::new(binary.iter().cloned().collect::<$ARRAY>()); + let ref_array: ArrayRef = + Arc::new(binary.iter().cloned().collect::()); let random_state = RandomState::with_seeds(0, 0, 0, 0); @@ -553,7 +606,7 @@ mod tests { #[test] fn create_hashes_fixed_size_binary() -> Result<()> { let input_arg = vec![vec![1, 2], vec![5, 6], vec![5, 6]]; - let fixed_size_binary_array = + let fixed_size_binary_array: ArrayRef = Arc::new(FixedSizeBinaryArray::try_from_iter(input_arg.into_iter()).unwrap()); let random_state = RandomState::with_seeds(0, 0, 0, 0); @@ -580,8 +633,9 @@ mod tests { Some("Longer than 12 bytes string"), ]; - let string_array = Arc::new(strings.iter().cloned().collect::<$ARRAY>()); - let dict_array = Arc::new( + let string_array: ArrayRef = + Arc::new(strings.iter().cloned().collect::<$ARRAY>()); + let dict_array: ArrayRef = Arc::new( strings .iter() .cloned() @@ -629,8 +683,9 @@ mod tests { fn create_hashes_for_dict_arrays() { let strings = [Some("foo"), None, Some("bar"), Some("foo"), None]; - let string_array = Arc::new(strings.iter().cloned().collect::()); - let dict_array = Arc::new( + let string_array: ArrayRef = + Arc::new(strings.iter().cloned().collect::()); + let dict_array: ArrayRef = Arc::new( strings .iter() .cloned() @@ -865,8 +920,9 @@ mod tests { let strings1 = [Some("foo"), None, Some("bar")]; let strings2 = [Some("blarg"), Some("blah"), None]; - let string_array = Arc::new(strings1.iter().cloned().collect::()); - let dict_array = Arc::new( + let string_array: ArrayRef = + Arc::new(strings1.iter().cloned().collect::()); + let dict_array: ArrayRef = Arc::new( strings2 .iter() .cloned() @@ -896,4 +952,52 @@ mod tests { assert_ne!(one_col_hashes, two_col_hashes); } + + #[test] + fn test_create_hashes_from_arrays() { + let int_array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 
4])); + let float_array: ArrayRef = + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])); + + let random_state = RandomState::with_seeds(0, 0, 0, 0); + let hashes_buff = &mut vec![0; int_array.len()]; + let hashes = + create_hashes(&[int_array, float_array], &random_state, hashes_buff).unwrap(); + assert_eq!(hashes.len(), 4,); + } + + #[test] + fn test_create_hashes_from_dyn_arrays() { + let int_array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4])); + let float_array: ArrayRef = + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])); + + // Verify that we can call create_hashes with only &dyn Array + fn test(arr1: &dyn Array, arr2: &dyn Array) { + let random_state = RandomState::with_seeds(0, 0, 0, 0); + let hashes_buff = &mut vec![0; arr1.len()]; + let hashes = create_hashes([arr1, arr2], &random_state, hashes_buff).unwrap(); + assert_eq!(hashes.len(), 4,); + } + test(&*int_array, &*float_array); + } + + #[test] + fn test_create_hashes_equivalence() { + let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4])); + let random_state = RandomState::with_seeds(0, 0, 0, 0); + + let mut hashes1 = vec![0; array.len()]; + create_hashes( + &[Arc::clone(&array) as ArrayRef], + &random_state, + &mut hashes1, + ) + .unwrap(); + + let mut hashes2 = vec![0; array.len()]; + create_hashes([array], &random_state, &mut hashes2).unwrap(); + + assert_eq!(hashes1, hashes2); + } } diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index 76c7b46e3273..549c265024f9 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -23,6 +23,9 @@ // Make sure fast / cheap clones on Arc are explicit: // https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] +// https://github.com/apache/datafusion/issues/18503 +#![deny(clippy::needless_pass_by_value)] +#![cfg_attr(test, allow(clippy::needless_pass_by_value))] mod column; mod dfschema; diff --git a/datafusion/common/src/metadata.rs b/datafusion/common/src/metadata.rs index 39065808efb9..3a10cc2b42f9 100644 --- a/datafusion/common/src/metadata.rs +++ b/datafusion/common/src/metadata.rs @@ -171,7 +171,6 @@ pub fn format_type_and_metadata( /// // Add any metadata from `FieldMetadata` to `Field` /// let updated_field = metadata.add_to_field(field); /// ``` -/// #[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)] pub struct FieldMetadata { /// The inner metadata of a literal expression, which is a map of string diff --git a/datafusion/common/src/nested_struct.rs b/datafusion/common/src/nested_struct.rs index 38060e370bfa..d43816f75b0e 100644 --- a/datafusion/common/src/nested_struct.rs +++ b/datafusion/common/src/nested_struct.rs @@ -110,16 +110,19 @@ fn cast_struct_column( /// temporal values are formatted when cast to strings. 
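// A small standalone sketch (not part of this patch) of what the
// `clippy::needless_pass_by_value` lint newly denied in lib.rs above flags:
// functions that take ownership of a value they only read. `total_owned` and
// `total_borrowed` are hypothetical helpers, not DataFusion APIs. Where an
// owned parameter must stay for API compatibility, individual functions opt
// out with `#[expect(clippy::needless_pass_by_value)]`, as a later hunk in
// this patch does.

// Flagged: takes `Vec<i64>` by value but never consumes or mutates it.
fn total_owned(values: Vec<i64>) -> i64 {
    values.iter().sum()
}

// Preferred: borrow a slice, so callers keep ownership and nothing is moved.
fn total_borrowed(values: &[i64]) -> i64 {
    values.iter().sum()
}

fn main() {
    let v = vec![1, 2, 3];
    assert_eq!(total_borrowed(&v), 6);
    assert_eq!(total_owned(v), 6);
}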
/// /// ``` -/// use std::sync::Arc; -/// use arrow::array::{Int64Array, ArrayRef}; +/// use arrow::array::{ArrayRef, Int64Array}; /// use arrow::compute::CastOptions; /// use arrow::datatypes::{DataType, Field}; /// use datafusion_common::nested_struct::cast_column; +/// use std::sync::Arc; /// /// let source: ArrayRef = Arc::new(Int64Array::from(vec![1, i64::MAX])); /// let target = Field::new("ints", DataType::Int32, true); /// // Permit lossy conversions by producing NULL on overflow instead of erroring -/// let options = CastOptions { safe: true, ..Default::default() }; +/// let options = CastOptions { +/// safe: true, +/// ..Default::default() +/// }; /// let result = cast_column(&source, &target, &options).unwrap(); /// assert!(result.is_null(1)); /// ``` diff --git a/datafusion/common/src/pyarrow.rs b/datafusion/common/src/pyarrow.rs index ff413e08ab07..18c6739735ff 100644 --- a/datafusion/common/src/pyarrow.rs +++ b/datafusion/common/src/pyarrow.rs @@ -22,7 +22,7 @@ use arrow::pyarrow::{FromPyArrow, ToPyArrow}; use pyo3::exceptions::PyException; use pyo3::prelude::PyErr; use pyo3::types::{PyAnyMethods, PyList}; -use pyo3::{Bound, FromPyObject, IntoPyObject, PyAny, PyObject, PyResult, Python}; +use pyo3::{Bound, FromPyObject, IntoPyObject, PyAny, PyResult, Python}; use crate::{DataFusionError, ScalarValue}; @@ -52,11 +52,11 @@ impl FromPyArrow for ScalarValue { } impl ToPyArrow for ScalarValue { - fn to_pyarrow(&self, py: Python) -> PyResult { + fn to_pyarrow<'py>(&self, py: Python<'py>) -> PyResult> { let array = self.to_array()?; // convert to pyarrow array using C data interface let pyarray = array.to_data().to_pyarrow(py)?; - let pyscalar = pyarray.call_method1(py, "__getitem__", (0,))?; + let pyscalar = pyarray.call_method1("__getitem__", (0,))?; Ok(pyscalar) } @@ -79,23 +79,22 @@ impl<'source> IntoPyObject<'source> for ScalarValue { let array = self.to_array()?; // convert to pyarrow array using C data interface let pyarray = array.to_data().to_pyarrow(py)?; - let pyarray_bound = pyarray.bind(py); - pyarray_bound.call_method1("__getitem__", (0,)) + pyarray.call_method1("__getitem__", (0,)) } } #[cfg(test)] mod tests { use pyo3::ffi::c_str; - use pyo3::prepare_freethreaded_python; use pyo3::py_run; use pyo3::types::PyDict; + use pyo3::Python; use super::*; fn init_python() { - prepare_freethreaded_python(); - Python::with_gil(|py| { + Python::initialize(); + Python::attach(|py| { if py.run(c_str!("import pyarrow"), None, None).is_err() { let locals = PyDict::new(py); py.run( @@ -127,7 +126,7 @@ mod tests { fn test_roundtrip() { init_python(); - let example_scalars = vec![ + let example_scalars = [ ScalarValue::Boolean(Some(true)), ScalarValue::Int32(Some(23)), ScalarValue::Float64(Some(12.34)), @@ -135,12 +134,11 @@ mod tests { ScalarValue::Date32(Some(1234)), ]; - Python::with_gil(|py| { + Python::attach(|py| { for scalar in example_scalars.iter() { - let result = ScalarValue::from_pyarrow_bound( - scalar.to_pyarrow(py).unwrap().bind(py), - ) - .unwrap(); + let result = + ScalarValue::from_pyarrow_bound(&scalar.to_pyarrow(py).unwrap()) + .unwrap(); assert_eq!(scalar, &result); } }); @@ -150,7 +148,7 @@ mod tests { fn test_py_scalar() -> PyResult<()> { init_python(); - Python::with_gil(|py| -> PyResult<()> { + Python::attach(|py| -> PyResult<()> { let scalar_float = ScalarValue::Float64(Some(12.34)); let py_float = scalar_float .into_pyobject(py)? 
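// A minimal standalone sketch (not part of this patch) of the pyo3 API
// migration the pyarrow.rs hunk above performs: `prepare_freethreaded_python()`
// becomes `Python::initialize()` and `Python::with_gil` becomes
// `Python::attach`. The names come from that hunk; the snippet assumes a pyo3
// release that exposes them and reuses the same `c_str!` / `eval` idiom as the
// tests above.
use pyo3::ffi::c_str;
use pyo3::prelude::*;

fn main() -> PyResult<()> {
    // Start the embedded interpreter once.
    Python::initialize();
    // Attach to the interpreter for the duration of the closure.
    Python::attach(|py| -> PyResult<()> {
        let answer: i64 = py.eval(c_str!("6 * 7"), None, None)?.extract()?;
        assert_eq!(answer, 42);
        Ok(())
    })
}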
diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index a70a027a8fac..fadd2e41eaba 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -171,9 +171,9 @@ pub use struct_builder::ScalarStructBuilder; /// let field_b = Field::new("b", DataType::Utf8, false); /// /// let s1 = ScalarStructBuilder::new() -/// .with_scalar(field_a, ScalarValue::from(1i32)) -/// .with_scalar(field_b, ScalarValue::from("foo")) -/// .build(); +/// .with_scalar(field_a, ScalarValue::from(1i32)) +/// .with_scalar(field_b, ScalarValue::from("foo")) +/// .build(); /// ``` /// /// ## Example: Creating a null [`ScalarValue::Struct`] using [`ScalarStructBuilder`] @@ -199,13 +199,13 @@ pub use struct_builder::ScalarStructBuilder; /// // Build a struct like: {a: 1, b: "foo"} /// // Field description /// let fields = Fields::from(vec![ -/// Field::new("a", DataType::Int32, false), -/// Field::new("b", DataType::Utf8, false), +/// Field::new("a", DataType::Int32, false), +/// Field::new("b", DataType::Utf8, false), /// ]); /// // one row arrays for each field /// let arrays: Vec = vec![ -/// Arc::new(Int32Array::from(vec![1])), -/// Arc::new(StringArray::from(vec!["foo"])), +/// Arc::new(Int32Array::from(vec![1])), +/// Arc::new(StringArray::from(vec!["foo"])), /// ]; /// // no nulls for this array /// let nulls = None; @@ -878,10 +878,10 @@ impl Hash for ScalarValue { fn hash_nested_array(arr: ArrayRef, state: &mut H) { let len = arr.len(); - let arrays = vec![arr]; let hashes_buffer = &mut vec![0; len]; let random_state = ahash::RandomState::with_seeds(0, 0, 0, 0); - let hashes = create_hashes(&arrays, &random_state, hashes_buffer).unwrap(); + let hashes = create_hashes(&[arr], &random_state, hashes_buffer) + .expect("hash_nested_array: failed to create row hashes"); // Hash back to std::hash::Hasher hashes.hash(state); } @@ -1068,8 +1068,8 @@ impl ScalarValue { /// /// Example /// ``` - /// use datafusion_common::ScalarValue; /// use arrow::datatypes::DataType; + /// use datafusion_common::ScalarValue; /// /// let scalar = ScalarValue::try_new_null(&DataType::Int32).unwrap(); /// assert_eq!(scalar.is_null(), true); @@ -1734,7 +1734,7 @@ impl ScalarValue { ) { return _internal_err!("Invalid precision and scale {err}"); } - if *scale <= 0 { + if *scale < 0 { return _internal_err!("Negative scale is not supported"); } match 10_i32.checked_pow((*scale + 1) as u32) { @@ -1750,7 +1750,7 @@ impl ScalarValue { ) { return _internal_err!("Invalid precision and scale {err}"); } - if *scale <= 0 { + if *scale < 0 { return _internal_err!("Negative scale is not supported"); } match i64::from(10).checked_pow((*scale + 1) as u32) { @@ -2231,23 +2231,16 @@ impl ScalarValue { /// /// # Example /// ``` - /// use datafusion_common::ScalarValue; /// use arrow::array::{BooleanArray, Int32Array}; + /// use datafusion_common::ScalarValue; /// /// let arr = Int32Array::from(vec![Some(1), None, Some(10)]); /// let five = ScalarValue::Int32(Some(5)); /// - /// let result = arrow::compute::kernels::cmp::lt( - /// &arr, - /// &five.to_scalar().unwrap(), - /// ).unwrap(); + /// let result = + /// arrow::compute::kernels::cmp::lt(&arr, &five.to_scalar().unwrap()).unwrap(); /// - /// let expected = BooleanArray::from(vec![ - /// Some(true), - /// None, - /// Some(false) - /// ] - /// ); + /// let expected = BooleanArray::from(vec![Some(true), None, Some(false)]); /// /// assert_eq!(&result, &expected); /// ``` @@ -2265,26 +2258,20 @@ impl ScalarValue { /// /// # Example /// ``` 
- /// use datafusion_common::ScalarValue; /// use arrow::array::{ArrayRef, BooleanArray}; + /// use datafusion_common::ScalarValue; /// /// let scalars = vec![ - /// ScalarValue::Boolean(Some(true)), - /// ScalarValue::Boolean(None), - /// ScalarValue::Boolean(Some(false)), + /// ScalarValue::Boolean(Some(true)), + /// ScalarValue::Boolean(None), + /// ScalarValue::Boolean(Some(false)), /// ]; /// /// // Build an Array from the list of ScalarValues - /// let array = ScalarValue::iter_to_array(scalars.into_iter()) - /// .unwrap(); + /// let array = ScalarValue::iter_to_array(scalars.into_iter()).unwrap(); /// - /// let expected: ArrayRef = std::sync::Arc::new( - /// BooleanArray::from(vec![ - /// Some(true), - /// None, - /// Some(false) - /// ] - /// )); + /// let expected: ArrayRef = + /// std::sync::Arc::new(BooleanArray::from(vec![Some(true), None, Some(false)])); /// /// assert_eq!(&array, &expected); /// ``` @@ -2731,23 +2718,24 @@ impl ScalarValue { /// /// Example /// ``` - /// use datafusion_common::ScalarValue; - /// use arrow::array::{ListArray, Int32Array}; + /// use arrow::array::{Int32Array, ListArray}; /// use arrow::datatypes::{DataType, Int32Type}; /// use datafusion_common::cast::as_list_array; + /// use datafusion_common::ScalarValue; /// /// let scalars = vec![ - /// ScalarValue::Int32(Some(1)), - /// ScalarValue::Int32(None), - /// ScalarValue::Int32(Some(2)) + /// ScalarValue::Int32(Some(1)), + /// ScalarValue::Int32(None), + /// ScalarValue::Int32(Some(2)), /// ]; /// /// let result = ScalarValue::new_list(&scalars, &DataType::Int32, true); /// - /// let expected = ListArray::from_iter_primitive::( - /// vec![ - /// Some(vec![Some(1), None, Some(2)]) - /// ]); + /// let expected = ListArray::from_iter_primitive::(vec![Some(vec![ + /// Some(1), + /// None, + /// Some(2), + /// ])]); /// /// assert_eq!(*result, expected); /// ``` @@ -2791,23 +2779,25 @@ impl ScalarValue { /// /// Example /// ``` - /// use datafusion_common::ScalarValue; - /// use arrow::array::{ListArray, Int32Array}; + /// use arrow::array::{Int32Array, ListArray}; /// use arrow::datatypes::{DataType, Int32Type}; /// use datafusion_common::cast::as_list_array; + /// use datafusion_common::ScalarValue; /// /// let scalars = vec![ - /// ScalarValue::Int32(Some(1)), - /// ScalarValue::Int32(None), - /// ScalarValue::Int32(Some(2)) + /// ScalarValue::Int32(Some(1)), + /// ScalarValue::Int32(None), + /// ScalarValue::Int32(Some(2)), /// ]; /// - /// let result = ScalarValue::new_list_from_iter(scalars.into_iter(), &DataType::Int32, true); + /// let result = + /// ScalarValue::new_list_from_iter(scalars.into_iter(), &DataType::Int32, true); /// - /// let expected = ListArray::from_iter_primitive::( - /// vec![ - /// Some(vec![Some(1), None, Some(2)]) - /// ]); + /// let expected = ListArray::from_iter_primitive::(vec![Some(vec![ + /// Some(1), + /// None, + /// Some(2), + /// ])]); /// /// assert_eq!(*result, expected); /// ``` @@ -2833,23 +2823,25 @@ impl ScalarValue { /// /// Example /// ``` - /// use datafusion_common::ScalarValue; - /// use arrow::array::{LargeListArray, Int32Array}; + /// use arrow::array::{Int32Array, LargeListArray}; /// use arrow::datatypes::{DataType, Int32Type}; /// use datafusion_common::cast::as_large_list_array; + /// use datafusion_common::ScalarValue; /// /// let scalars = vec![ - /// ScalarValue::Int32(Some(1)), - /// ScalarValue::Int32(None), - /// ScalarValue::Int32(Some(2)) + /// ScalarValue::Int32(Some(1)), + /// ScalarValue::Int32(None), + /// 
ScalarValue::Int32(Some(2)), /// ]; /// /// let result = ScalarValue::new_large_list(&scalars, &DataType::Int32); /// - /// let expected = LargeListArray::from_iter_primitive::( - /// vec![ - /// Some(vec![Some(1), None, Some(2)]) - /// ]); + /// let expected = + /// LargeListArray::from_iter_primitive::(vec![Some(vec![ + /// Some(1), + /// None, + /// Some(2), + /// ])]); /// /// assert_eq!(*result, expected); /// ``` @@ -3248,14 +3240,14 @@ impl ScalarValue { /// /// Example 1: Array (ScalarValue::Int32) /// ``` - /// use datafusion_common::ScalarValue; /// use arrow::array::ListArray; /// use arrow::datatypes::{DataType, Int32Type}; + /// use datafusion_common::ScalarValue; /// /// // Equivalent to [[1,2,3], [4,5]] /// let list_arr = ListArray::from_iter_primitive::(vec![ - /// Some(vec![Some(1), Some(2), Some(3)]), - /// Some(vec![Some(4), Some(5)]) + /// Some(vec![Some(1), Some(2), Some(3)]), + /// Some(vec![Some(4), Some(5)]), /// ]); /// /// // Convert the array into Scalar Values for each row @@ -3278,15 +3270,15 @@ impl ScalarValue { /// /// Example 2: Nested array (ScalarValue::List) /// ``` - /// use datafusion_common::ScalarValue; /// use arrow::array::ListArray; /// use arrow::datatypes::{DataType, Int32Type}; /// use datafusion_common::utils::SingleRowListArrayBuilder; + /// use datafusion_common::ScalarValue; /// use std::sync::Arc; /// /// let list_arr = ListArray::from_iter_primitive::(vec![ - /// Some(vec![Some(1), Some(2), Some(3)]), - /// Some(vec![Some(4), Some(5)]) + /// Some(vec![Some(1), Some(2), Some(3)]), + /// Some(vec![Some(4), Some(5)]), /// ]); /// /// // Wrap into another layer of list, we got nested array as [ [[1,2,3], [4,5]] ] @@ -3295,33 +3287,34 @@ impl ScalarValue { /// // Convert the array into Scalar Values for each row, we got 1D arrays in this example /// let scalar_vec = ScalarValue::convert_array_to_scalar_vec(&list_arr).unwrap(); /// - /// let l1 = ListArray::from_iter_primitive::(vec![ - /// Some(vec![Some(1), Some(2), Some(3)]), - /// ]); - /// let l2 = ListArray::from_iter_primitive::(vec![ - /// Some(vec![Some(4), Some(5)]), - /// ]); + /// let l1 = ListArray::from_iter_primitive::(vec![Some(vec![ + /// Some(1), + /// Some(2), + /// Some(3), + /// ])]); + /// let l2 = ListArray::from_iter_primitive::(vec![Some(vec![ + /// Some(4), + /// Some(5), + /// ])]); /// - /// let expected = vec![ - /// Some(vec![ + /// let expected = vec![Some(vec![ /// ScalarValue::List(Arc::new(l1)), /// ScalarValue::List(Arc::new(l2)), - /// ]), - /// ]; + /// ])]; /// /// assert_eq!(scalar_vec, expected); /// ``` /// /// Example 3: Nullable array /// ``` - /// use datafusion_common::ScalarValue; /// use arrow::array::ListArray; /// use arrow::datatypes::{DataType, Int32Type}; + /// use datafusion_common::ScalarValue; /// /// let list_arr = ListArray::from_iter_primitive::(vec![ - /// Some(vec![Some(1), Some(2), Some(3)]), - /// None, - /// Some(vec![Some(4), Some(5)]) + /// Some(vec![Some(1), Some(2), Some(3)]), + /// None, + /// Some(vec![Some(4), Some(5)]), /// ]); /// /// // Convert the array into Scalar Values for each row @@ -4414,6 +4407,7 @@ macro_rules! 
impl_scalar { impl_scalar!(f64, Float64); impl_scalar!(f32, Float32); +impl_scalar!(f16, Float16); impl_scalar!(i8, Int8); impl_scalar!(i16, Int16); impl_scalar!(i32, Int32); @@ -4570,6 +4564,7 @@ impl_try_from!(UInt8, u8); impl_try_from!(UInt16, u16); impl_try_from!(UInt32, u32); impl_try_from!(UInt64, u64); +impl_try_from!(Float16, f16); impl_try_from!(Float32, f32); impl_try_from!(Float64, f64); impl_try_from!(Boolean, bool); @@ -4655,9 +4650,9 @@ impl fmt::Display for ScalarValue { } None => write!(f, "NULL")?, }, - ScalarValue::List(arr) => fmt_list(arr.to_owned() as ArrayRef, f)?, - ScalarValue::LargeList(arr) => fmt_list(arr.to_owned() as ArrayRef, f)?, - ScalarValue::FixedSizeList(arr) => fmt_list(arr.to_owned() as ArrayRef, f)?, + ScalarValue::List(arr) => fmt_list(arr.as_ref(), f)?, + ScalarValue::LargeList(arr) => fmt_list(arr.as_ref(), f)?, + ScalarValue::FixedSizeList(arr) => fmt_list(arr.as_ref(), f)?, ScalarValue::Date32(e) => format_option!( f, e.map(|v| { @@ -4779,12 +4774,11 @@ impl fmt::Display for ScalarValue { } } -fn fmt_list(arr: ArrayRef, f: &mut fmt::Formatter) -> fmt::Result { +fn fmt_list(arr: &dyn Array, f: &mut fmt::Formatter) -> fmt::Result { // ScalarValue List, LargeList, FixedSizeList should always have a single element assert_eq!(arr.len(), 1); let options = FormatOptions::default().with_display_error(true); - let formatter = - ArrayFormatter::try_new(arr.as_ref() as &dyn Array, &options).unwrap(); + let formatter = ArrayFormatter::try_new(arr, &options).unwrap(); let value_formatter = formatter.value(0); write!(f, "{value_formatter}") } @@ -8700,7 +8694,7 @@ mod tests { ])), true, )); - let scalars = vec![ + let scalars = [ ScalarValue::try_new_null(&DataType::List(Arc::clone(&field_ref))).unwrap(), ScalarValue::try_new_null(&DataType::LargeList(Arc::clone(&field_ref))) .unwrap(), diff --git a/datafusion/common/src/scalar/struct_builder.rs b/datafusion/common/src/scalar/struct_builder.rs index fd19dccf8963..045b5778243d 100644 --- a/datafusion/common/src/scalar/struct_builder.rs +++ b/datafusion/common/src/scalar/struct_builder.rs @@ -47,13 +47,11 @@ impl ScalarStructBuilder { /// ```rust /// # use arrow::datatypes::{DataType, Field}; /// # use datafusion_common::scalar::ScalarStructBuilder; - /// let fields = vec![ - /// Field::new("a", DataType::Int32, false), - /// ]; + /// let fields = vec![Field::new("a", DataType::Int32, false)]; /// let sv = ScalarStructBuilder::new_null(fields); /// // Note this is `NULL`, not `{a: NULL}` /// assert_eq!(format!("{sv}"), "NULL"); - ///``` + /// ``` /// /// To create a struct where the *fields* are null, use `Self::new()` and /// pass null values for each field: @@ -65,9 +63,9 @@ impl ScalarStructBuilder { /// let field = Field::new("a", DataType::Int32, true); /// // add a null value for the "a" field /// let sv = ScalarStructBuilder::new() - /// .with_scalar(field, ScalarValue::Int32(None)) - /// .build() - /// .unwrap(); + /// .with_scalar(field, ScalarValue::Int32(None)) + /// .build() + /// .unwrap(); /// // value is not null, but field is /// assert_eq!(format!("{sv}"), "{a:}"); /// ``` @@ -85,6 +83,7 @@ impl ScalarStructBuilder { } /// Add the specified field and `ScalarValue` to the struct. 
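// A short standalone sketch (not part of this patch) of what the new
// `impl_scalar!(f16, Float16)` and `impl_try_from!(Float16, f16)` lines enable,
// assuming those macros generate the usual `From` / `TryFrom` conversions as
// their other invocations do. `half::f16` is the same half-float type arrow
// uses for `Float16` arrays.
use datafusion_common::ScalarValue;
use half::f16;

fn main() {
    let x = f16::from_f32(1.5);

    // `impl_scalar!(f16, Float16)` provides the conversion into a scalar ...
    let scalar = ScalarValue::from(x);
    assert_eq!(scalar, ScalarValue::Float16(Some(x)));

    // ... and `impl_try_from!(Float16, f16)` the conversion back out.
    let roundtrip = f16::try_from(scalar).unwrap();
    assert_eq!(roundtrip, x);
}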
+ #[expect(clippy::needless_pass_by_value)] // Skip for public API's compatibility pub fn with_scalar(self, field: impl IntoFieldRef, value: ScalarValue) -> Self { // valid scalar value should not fail let array = value.to_array().unwrap(); diff --git a/datafusion/common/src/stats.rs b/datafusion/common/src/stats.rs index 2481a88676ef..da298c20ebcb 100644 --- a/datafusion/common/src/stats.rs +++ b/datafusion/common/src/stats.rs @@ -520,33 +520,35 @@ impl Statistics { /// # use arrow::datatypes::{Field, Schema, DataType}; /// # use datafusion_common::stats::Precision; /// let stats1 = Statistics::default() - /// .with_num_rows(Precision::Exact(1)) - /// .with_total_byte_size(Precision::Exact(2)) - /// .add_column_statistics(ColumnStatistics::new_unknown() - /// .with_null_count(Precision::Exact(3)) - /// .with_min_value(Precision::Exact(ScalarValue::from(4))) - /// .with_max_value(Precision::Exact(ScalarValue::from(5))) - /// ); + /// .with_num_rows(Precision::Exact(1)) + /// .with_total_byte_size(Precision::Exact(2)) + /// .add_column_statistics( + /// ColumnStatistics::new_unknown() + /// .with_null_count(Precision::Exact(3)) + /// .with_min_value(Precision::Exact(ScalarValue::from(4))) + /// .with_max_value(Precision::Exact(ScalarValue::from(5))), + /// ); /// /// let stats2 = Statistics::default() - /// .with_num_rows(Precision::Exact(10)) - /// .with_total_byte_size(Precision::Inexact(20)) - /// .add_column_statistics(ColumnStatistics::new_unknown() - /// // absent null count - /// .with_min_value(Precision::Exact(ScalarValue::from(40))) - /// .with_max_value(Precision::Exact(ScalarValue::from(50))) - /// ); + /// .with_num_rows(Precision::Exact(10)) + /// .with_total_byte_size(Precision::Inexact(20)) + /// .add_column_statistics( + /// ColumnStatistics::new_unknown() + /// // absent null count + /// .with_min_value(Precision::Exact(ScalarValue::from(40))) + /// .with_max_value(Precision::Exact(ScalarValue::from(50))), + /// ); /// /// let merged_stats = stats1.try_merge(&stats2).unwrap(); /// let expected_stats = Statistics::default() - /// .with_num_rows(Precision::Exact(11)) - /// .with_total_byte_size(Precision::Inexact(22)) // inexact in stats2 --> inexact - /// .add_column_statistics( - /// ColumnStatistics::new_unknown() - /// .with_null_count(Precision::Absent) // missing from stats2 --> absent - /// .with_min_value(Precision::Exact(ScalarValue::from(4))) - /// .with_max_value(Precision::Exact(ScalarValue::from(50))) - /// ); + /// .with_num_rows(Precision::Exact(11)) + /// .with_total_byte_size(Precision::Inexact(22)) // inexact in stats2 --> inexact + /// .add_column_statistics( + /// ColumnStatistics::new_unknown() + /// .with_null_count(Precision::Absent) // missing from stats2 --> absent + /// .with_min_value(Precision::Exact(ScalarValue::from(4))) + /// .with_max_value(Precision::Exact(ScalarValue::from(50))), + /// ); /// /// assert_eq!(merged_stats, expected_stats) /// ``` diff --git a/datafusion/common/src/table_reference.rs b/datafusion/common/src/table_reference.rs index 574465856760..3163a8b16c8d 100644 --- a/datafusion/common/src/table_reference.rs +++ b/datafusion/common/src/table_reference.rs @@ -69,8 +69,11 @@ impl std::fmt::Display for ResolvedTableReference { /// /// // Get a table reference to 'myschema.mytable' (note the capitalization) /// let table_reference = TableReference::from("MySchema.MyTable"); -/// assert_eq!(table_reference, TableReference::partial("myschema", "mytable")); -///``` +/// assert_eq!( +/// table_reference, +/// 
TableReference::partial("myschema", "mytable") +/// ); +/// ``` #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub enum TableReference { /// An unqualified table reference, e.g. "table" @@ -247,7 +250,10 @@ impl TableReference { /// assert_eq!(table_reference.to_quoted_string(), "myschema.mytable"); /// /// let table_reference = TableReference::partial("MySchema", "MyTable"); - /// assert_eq!(table_reference.to_quoted_string(), r#""MySchema"."MyTable""#); + /// assert_eq!( + /// table_reference.to_quoted_string(), + /// r#""MySchema"."MyTable""# + /// ); /// ``` pub fn to_quoted_string(&self) -> String { match self { diff --git a/datafusion/common/src/test_util.rs b/datafusion/common/src/test_util.rs index d97d4003e729..c51dea1c4de0 100644 --- a/datafusion/common/src/test_util.rs +++ b/datafusion/common/src/test_util.rs @@ -55,7 +55,7 @@ pub fn format_batches(results: &[RecordBatch]) -> Result i64 { 2 } /// let expr = orig_expr(); /// let ret = Transformed::no(expr.clone()) -/// .transform_data(|expr| { -/// // closure returns a result and potentially transforms the node -/// // in this example, it does transform the node -/// let new_expr = make_new_expr(expr); -/// Ok(Transformed::yes(new_expr)) -/// }).unwrap(); +/// .transform_data(|expr| { +/// // closure returns a result and potentially transforms the node +/// // in this example, it does transform the node +/// let new_expr = make_new_expr(expr); +/// Ok(Transformed::yes(new_expr)) +/// }) +/// .unwrap(); /// // transformed flag is the union of the original ans closure's transformed flag /// assert!(ret.transformed); /// ``` diff --git a/datafusion/common/src/types/builtin.rs b/datafusion/common/src/types/builtin.rs index ec69db790377..314529b99a34 100644 --- a/datafusion/common/src/types/builtin.rs +++ b/datafusion/common/src/types/builtin.rs @@ -15,9 +15,17 @@ // specific language governing permissions and limitations // under the License. +use arrow::datatypes::IntervalUnit::*; + use crate::types::{LogicalTypeRef, NativeType}; use std::sync::{Arc, LazyLock}; +/// Create a singleton and accompanying static variable for a [`LogicalTypeRef`] +/// of a [`NativeType`]. +/// * `name`: name of the static variable, must be unique. +/// * `getter`: name of the public function that will return the singleton instance +/// of the static variable. +/// * `ty`: the [`NativeType`]. macro_rules! singleton { ($name:ident, $getter:ident, $ty:ident) => { static $name: LazyLock = @@ -31,6 +39,26 @@ macro_rules! singleton { }; } +/// Similar to [`singleton`], but for native types that have variants, such as +/// `NativeType::Interval(MonthDayNano)`. +/// * `name`: name of the static variable, must be unique. +/// * `getter`: name of the public function that will return the singleton instance +/// of the static variable. +/// * `ty`: the [`NativeType`]. +/// * `variant`: specific variant of the `ty`. +macro_rules! 
singleton_variant { + ($name:ident, $getter:ident, $ty:ident, $variant:ident) => { + static $name: LazyLock = + LazyLock::new(|| Arc::new(NativeType::$ty($variant))); + + #[doc = "Getter for singleton instance of a logical type representing"] + #[doc = concat!("[`NativeType::", stringify!($ty), "`] of unit [`", stringify!($variant),"`].`")] + pub fn $getter() -> LogicalTypeRef { + Arc::clone(&$name) + } + }; +} + singleton!(LOGICAL_NULL, logical_null, Null); singleton!(LOGICAL_BOOLEAN, logical_boolean, Boolean); singleton!(LOGICAL_INT8, logical_int8, Int8); @@ -47,3 +75,10 @@ singleton!(LOGICAL_FLOAT64, logical_float64, Float64); singleton!(LOGICAL_DATE, logical_date, Date); singleton!(LOGICAL_BINARY, logical_binary, Binary); singleton!(LOGICAL_STRING, logical_string, String); + +singleton_variant!( + LOGICAL_INTERVAL_MDN, + logical_interval_mdn, + Interval, + MonthDayNano +); diff --git a/datafusion/common/src/types/logical.rs b/datafusion/common/src/types/logical.rs index eb7cf88e0075..674b1a41204d 100644 --- a/datafusion/common/src/types/logical.rs +++ b/datafusion/common/src/types/logical.rs @@ -67,12 +67,12 @@ pub type LogicalTypeRef = Arc; /// &NativeType::String /// } /// -/// fn signature(&self) -> TypeSignature<'_> { -/// TypeSignature::Extension { -/// name: "JSON", -/// parameters: &[], -/// } -/// } +/// fn signature(&self) -> TypeSignature<'_> { +/// TypeSignature::Extension { +/// name: "JSON", +/// parameters: &[], +/// } +/// } /// } /// ``` pub trait LogicalType: Sync + Send { diff --git a/datafusion/common/src/types/native.rs b/datafusion/common/src/types/native.rs index 5cef0adfbde8..a1495b779ac9 100644 --- a/datafusion/common/src/types/native.rs +++ b/datafusion/common/src/types/native.rs @@ -430,22 +430,7 @@ impl From for NativeType { impl NativeType { #[inline] pub fn is_numeric(&self) -> bool { - use NativeType::*; - matches!( - self, - UInt8 - | UInt16 - | UInt32 - | UInt64 - | Int8 - | Int16 - | Int32 - | Int64 - | Float16 - | Float32 - | Float64 - | Decimal(_, _) - ) + self.is_integer() || self.is_float() || self.is_decimal() } #[inline] @@ -486,4 +471,19 @@ impl NativeType { pub fn is_binary(&self) -> bool { matches!(self, NativeType::Binary | NativeType::FixedSizeBinary(_)) } + + #[inline] + pub fn is_null(&self) -> bool { + matches!(self, NativeType::Null) + } + + #[inline] + pub fn is_decimal(&self) -> bool { + matches!(self, Self::Decimal(_, _)) + } + + #[inline] + pub fn is_float(&self) -> bool { + matches!(self, Self::Float16 | Self::Float32 | Self::Float64) + } } diff --git a/datafusion/common/src/utils/memory.rs b/datafusion/common/src/utils/memory.rs index 29e523996cf4..a56b940fab66 100644 --- a/datafusion/common/src/utils/memory.rs +++ b/datafusion/common/src/utils/memory.rs @@ -56,8 +56,8 @@ use std::mem::size_of; /// impl MyStruct { /// fn size(&self) -> Result { /// let num_elements = self.values.len(); -/// let fixed_size = std::mem::size_of_val(self) + -/// std::mem::size_of_val(&self.values); +/// let fixed_size = +/// std::mem::size_of_val(self) + std::mem::size_of_val(&self.values); /// /// estimate_memory_size::(num_elements, fixed_size) /// } @@ -73,8 +73,8 @@ use std::mem::size_of; /// let num_rows = 100; /// let fixed_size = std::mem::size_of::>(); /// let estimated_hashtable_size = -/// estimate_memory_size::<(u64, u64)>(num_rows,fixed_size) -/// .expect("Size estimation failed"); +/// estimate_memory_size::<(u64, u64)>(num_rows, fixed_size) +/// .expect("Size estimation failed"); /// ``` pub fn estimate_memory_size(num_elements: usize, 
fixed_size: usize) -> Result { // For the majority of cases hashbrown overestimates the bucket quantity diff --git a/datafusion/common/src/utils/mod.rs b/datafusion/common/src/utils/mod.rs index 045c02a5a2aa..7b145ac3ae21 100644 --- a/datafusion/common/src/utils/mod.rs +++ b/datafusion/common/src/utils/mod.rs @@ -46,26 +46,23 @@ use std::thread::available_parallelism; /// /// Example: /// ``` -/// use arrow::datatypes::{SchemaRef, Schema, Field, DataType}; +/// use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; /// use datafusion_common::project_schema; /// /// // Schema with columns 'a', 'b', and 'c' /// let schema = SchemaRef::new(Schema::new(vec![ -/// Field::new("a", DataType::Int32, true), -/// Field::new("b", DataType::Int64, true), -/// Field::new("c", DataType::Utf8, true), +/// Field::new("a", DataType::Int32, true), +/// Field::new("b", DataType::Int64, true), +/// Field::new("c", DataType::Utf8, true), /// ])); /// /// // Pick columns 'c' and 'b' -/// let projection = Some(vec![2,1]); -/// let projected_schema = project_schema( -/// &schema, -/// projection.as_ref() -/// ).unwrap(); +/// let projection = Some(vec![2, 1]); +/// let projected_schema = project_schema(&schema, projection.as_ref()).unwrap(); /// /// let expected_schema = SchemaRef::new(Schema::new(vec![ -/// Field::new("c", DataType::Utf8, true), -/// Field::new("b", DataType::Int64, true), +/// Field::new("c", DataType::Utf8, true), +/// Field::new("b", DataType::Int64, true), /// ])); /// /// assert_eq!(projected_schema, expected_schema); @@ -398,9 +395,11 @@ pub fn longest_consecutive_prefix>( /// # use arrow::array::types::Int64Type; /// # use datafusion_common::utils::SingleRowListArrayBuilder; /// // Array is [1, 2, 3] -/// let arr = ListArray::from_iter_primitive::(vec![ -/// Some(vec![Some(1), Some(2), Some(3)]), -/// ]); +/// let arr = ListArray::from_iter_primitive::(vec![Some(vec![ +/// Some(1), +/// Some(2), +/// Some(3), +/// ])]); /// // Wrap as a list array: [[1, 2, 3]] /// let list_arr = SingleRowListArrayBuilder::new(Arc::new(arr)).build_list_array(); /// assert_eq!(list_arr.len(), 1); @@ -554,7 +553,8 @@ pub fn fixed_size_list_to_arrays(a: &ArrayRef) -> Vec { /// use datafusion_common::utils::base_type; /// use std::sync::Arc; /// -/// let data_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))); +/// let data_type = +/// DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))); /// assert_eq!(base_type(&data_type), DataType::Int32); /// /// let data_type = DataType::Int32; @@ -906,16 +906,19 @@ pub fn get_available_parallelism() -> usize { /// # use datafusion_common::utils::take_function_args; /// # use datafusion_common::ScalarValue; /// fn my_function(args: &[ScalarValue]) -> Result<()> { -/// // function expects 2 args, so create a 2-element array -/// let [arg1, arg2] = take_function_args("my_function", args)?; -/// // ... do stuff.. -/// Ok(()) +/// // function expects 2 args, so create a 2-element array +/// let [arg1, arg2] = take_function_args("my_function", args)?; +/// // ... do stuff.. 
+/// Ok(()) /// } /// /// // Calling the function with 1 argument produces an error: /// let args = vec![ScalarValue::Int32(Some(10))]; /// let err = my_function(&args).unwrap_err(); -/// assert_eq!(err.to_string(), "Execution error: my_function function requires 2 arguments, got 1"); +/// assert_eq!( +/// err.to_string(), +/// "Execution error: my_function function requires 2 arguments, got 1" +/// ); /// // Calling the function with 2 arguments works great /// let args = vec![ScalarValue::Int32(Some(10)), ScalarValue::Int32(Some(20))]; /// my_function(&args).unwrap(); diff --git a/datafusion/common/src/utils/proxy.rs b/datafusion/common/src/utils/proxy.rs index d940677a5fb3..fb951aa3b028 100644 --- a/datafusion/common/src/utils/proxy.rs +++ b/datafusion/common/src/utils/proxy.rs @@ -47,7 +47,9 @@ pub trait VecAllocExt { /// assert_eq!(allocated, 16); // no new allocation needed /// /// // push more data into the vec - /// for _ in 0..10 { vec.push_accounted(1, &mut allocated); } + /// for _ in 0..10 { + /// vec.push_accounted(1, &mut allocated); + /// } /// assert_eq!(allocated, 64); // underlying vec has space for 10 u32s /// assert_eq!(vec.allocated_size(), 64); /// ``` @@ -82,7 +84,9 @@ pub trait VecAllocExt { /// assert_eq!(vec.allocated_size(), 16); // no new allocation needed /// /// // push more data into the vec - /// for _ in 0..10 { vec.push(1); } + /// for _ in 0..10 { + /// vec.push(1); + /// } /// assert_eq!(vec.allocated_size(), 64); // space for 64 now /// ``` fn allocated_size(&self) -> usize; @@ -133,7 +137,9 @@ pub trait RawTableAllocExt { /// assert_eq!(allocated, 64); /// /// // insert more values - /// for i in 0..100 { table.insert_accounted(i, hash_fn, &mut allocated); } + /// for i in 0..100 { + /// table.insert_accounted(i, hash_fn, &mut allocated); + /// } /// assert_eq!(allocated, 400); /// ``` fn insert_accounted( @@ -200,7 +206,9 @@ pub trait HashTableAllocExt { /// assert_eq!(allocated, 64); /// /// // insert more values - /// for i in 0..100 { table.insert_accounted(i, hash_fn, &mut allocated); } + /// for i in 0..100 { + /// table.insert_accounted(i, hash_fn, &mut allocated); + /// } /// assert_eq!(allocated, 400); /// ``` fn insert_accounted( diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 22c9f43a902e..67a73ac6f669 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -32,6 +32,9 @@ rust-version = { workspace = true } [package.metadata.docs.rs] all-features = true +# Note: add additional linter rules in lib.rs. 
+# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true @@ -241,6 +244,10 @@ required-features = ["parquet"] harness = false name = "sql_planner" +[[bench]] +harness = false +name = "sql_planner_extended" + [[bench]] harness = false name = "sql_query_with_io" diff --git a/datafusion/core/benches/aggregate_query_sql.rs b/datafusion/core/benches/aggregate_query_sql.rs index 9da341ce2e92..87aeed49337e 100644 --- a/datafusion/core/benches/aggregate_query_sql.rs +++ b/datafusion/core/benches/aggregate_query_sql.rs @@ -21,17 +21,19 @@ extern crate arrow; extern crate datafusion; mod data_utils; + use crate::criterion::Criterion; use data_utils::create_table_provider; use datafusion::error::Result; use datafusion::execution::context::SessionContext; use parking_lot::Mutex; +use std::hint::black_box; use std::sync::Arc; use tokio::runtime::Runtime; fn query(ctx: Arc>, rt: &Runtime, sql: &str) { let df = rt.block_on(ctx.lock().sql(sql)).unwrap(); - criterion::black_box(rt.block_on(df.collect()).unwrap()); + black_box(rt.block_on(df.collect()).unwrap()); } fn create_context( diff --git a/datafusion/core/benches/csv_load.rs b/datafusion/core/benches/csv_load.rs index 3f984757466d..de0f0d825057 100644 --- a/datafusion/core/benches/csv_load.rs +++ b/datafusion/core/benches/csv_load.rs @@ -21,12 +21,14 @@ extern crate arrow; extern crate datafusion; mod data_utils; + use crate::criterion::Criterion; use datafusion::error::Result; use datafusion::execution::context::SessionContext; use datafusion::prelude::CsvReadOptions; use datafusion::test_util::csv::TestCsvFile; use parking_lot::Mutex; +use std::hint::black_box; use std::sync::Arc; use std::time::Duration; use test_utils::AccessLogGenerator; @@ -39,7 +41,7 @@ fn load_csv( options: CsvReadOptions, ) { let df = rt.block_on(ctx.lock().read_csv(path, options)).unwrap(); - criterion::black_box(rt.block_on(df.collect()).unwrap()); + black_box(rt.block_on(df.collect()).unwrap()); } fn create_context() -> Result>> { diff --git a/datafusion/core/benches/dataframe.rs b/datafusion/core/benches/dataframe.rs index 12eb34719e4b..00fa85918347 100644 --- a/datafusion/core/benches/dataframe.rs +++ b/datafusion/core/benches/dataframe.rs @@ -26,6 +26,7 @@ use datafusion::datasource::MemTable; use datafusion::prelude::SessionContext; use datafusion_expr::col; use datafusion_functions::expr_fn::btrim; +use std::hint::black_box; use std::sync::Arc; use tokio::runtime::Runtime; @@ -45,7 +46,7 @@ fn create_context(field_count: u32) -> datafusion_common::Result, rt: &Runtime) { - criterion::black_box(rt.block_on(async { + black_box(rt.block_on(async { let mut data_frame = ctx.table("t").await.unwrap(); for i in 0..column_count { diff --git a/datafusion/core/benches/distinct_query_sql.rs b/datafusion/core/benches/distinct_query_sql.rs index c1ef55992689..d05e8b13b2af 100644 --- a/datafusion/core/benches/distinct_query_sql.rs +++ b/datafusion/core/benches/distinct_query_sql.rs @@ -30,12 +30,13 @@ use datafusion_execution::config::SessionConfig; use datafusion_execution::TaskContext; use parking_lot::Mutex; +use std::hint::black_box; use std::{sync::Arc, time::Duration}; use tokio::runtime::Runtime; fn query(ctx: Arc>, rt: &Runtime, sql: &str) { let df = rt.block_on(ctx.lock().sql(sql)).unwrap(); - criterion::black_box(rt.block_on(df.collect()).unwrap()); + black_box(rt.block_on(df.collect()).unwrap()); } fn create_context( @@ -124,8 +125,7 @@ async fn distinct_with_limit( } fn run(rt: 
&Runtime, plan: Arc, ctx: Arc) { - criterion::black_box(rt.block_on(distinct_with_limit(plan.clone(), ctx.clone()))) - .unwrap(); + black_box(rt.block_on(distinct_with_limit(plan.clone(), ctx.clone()))).unwrap(); } pub async fn create_context_sampled_data( diff --git a/datafusion/core/benches/filter_query_sql.rs b/datafusion/core/benches/filter_query_sql.rs index c82a1607184d..16905e0f9660 100644 --- a/datafusion/core/benches/filter_query_sql.rs +++ b/datafusion/core/benches/filter_query_sql.rs @@ -24,13 +24,14 @@ use criterion::{criterion_group, criterion_main, Criterion}; use datafusion::prelude::SessionContext; use datafusion::{datasource::MemTable, error::Result}; use futures::executor::block_on; +use std::hint::black_box; use std::sync::Arc; use tokio::runtime::Runtime; async fn query(ctx: &SessionContext, rt: &Runtime, sql: &str) { // execute the query let df = rt.block_on(ctx.sql(sql)).unwrap(); - criterion::black_box(rt.block_on(df.collect()).unwrap()); + black_box(rt.block_on(df.collect()).unwrap()); } fn create_context(array_len: usize, batch_size: usize) -> Result { diff --git a/datafusion/core/benches/map_query_sql.rs b/datafusion/core/benches/map_query_sql.rs index 063b8e6c86bb..09234546b2df 100644 --- a/datafusion/core/benches/map_query_sql.rs +++ b/datafusion/core/benches/map_query_sql.rs @@ -15,10 +15,11 @@ // specific language governing permissions and limitations // under the License. +use std::hint::black_box; use std::sync::Arc; use arrow::array::{ArrayRef, Int32Array, RecordBatch}; -use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use criterion::{criterion_group, criterion_main, Criterion}; use parking_lot::Mutex; use rand::prelude::ThreadRng; use rand::Rng; diff --git a/datafusion/core/benches/parquet_query_sql.rs b/datafusion/core/benches/parquet_query_sql.rs index 14dcdf15f173..e2b381048013 100644 --- a/datafusion/core/benches/parquet_query_sql.rs +++ b/datafusion/core/benches/parquet_query_sql.rs @@ -166,11 +166,12 @@ fn generate_file() -> NamedTempFile { } let metadata = writer.close().unwrap(); + let file_metadata = metadata.file_metadata(); assert_eq!( - metadata.num_rows as usize, + file_metadata.num_rows() as usize, WRITE_RECORD_BATCH_SIZE * NUM_BATCHES ); - assert_eq!(metadata.row_groups.len(), EXPECTED_ROW_GROUPS); + assert_eq!(metadata.row_groups().len(), EXPECTED_ROW_GROUPS); println!( "Generated parquet file in {} seconds", diff --git a/datafusion/core/benches/spm.rs b/datafusion/core/benches/spm.rs index 5c244832300e..ecc3f908d4b1 100644 --- a/datafusion/core/benches/spm.rs +++ b/datafusion/core/benches/spm.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
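The benchmark hunks in this patch all make the same mechanical change: `black_box` is imported from `std::hint` rather than being reached through `criterion::black_box`. A minimal sketch of the pattern, using an illustrative benchmark that is not part of this patch:

```rust
use criterion::{criterion_group, criterion_main, Criterion};
use std::hint::black_box;

fn bench_sum(c: &mut Criterion) {
    c.bench_function("sum_1000", |b| {
        b.iter(|| {
            // was: criterion::black_box((0..1000u64).sum::<u64>())
            black_box((0..1000u64).sum::<u64>())
        })
    });
}

criterion_group!(benches, bench_sum);
criterion_main!(benches);
```

The measured closures themselves are untouched; only the source of `black_box` changes.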
+use std::hint::black_box; use std::sync::Arc; use arrow::array::{ArrayRef, Int32Array, Int64Array, RecordBatch, StringArray}; @@ -25,7 +26,7 @@ use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeE use datafusion_physical_plan::{collect, ExecutionPlan}; use criterion::async_executor::FuturesExecutor; -use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use criterion::{criterion_group, criterion_main, Criterion}; use datafusion_datasource::memory::MemorySourceConfig; fn generate_spm_for_round_robin_tie_breaker( diff --git a/datafusion/core/benches/sql_planner.rs b/datafusion/core/benches/sql_planner.rs index 83563099cad6..6266a7184cf5 100644 --- a/datafusion/core/benches/sql_planner.rs +++ b/datafusion/core/benches/sql_planner.rs @@ -25,18 +25,12 @@ mod data_utils; use crate::criterion::Criterion; use arrow::array::{ArrayRef, RecordBatch}; use arrow::datatypes::{DataType, Field, Fields, Schema}; -use arrow_schema::TimeUnit::Nanosecond; use criterion::Bencher; use datafusion::datasource::MemTable; use datafusion::execution::context::SessionContext; -use datafusion::prelude::DataFrame; use datafusion_common::{config::Dialect, ScalarValue}; -use datafusion_expr::Expr::Literal; -use datafusion_expr::{cast, col, lit, not, try_cast, when}; -use datafusion_functions::expr_fn::{ - btrim, length, regexp_like, regexp_replace, to_timestamp, upper, -}; -use std::ops::Rem; +use datafusion_expr::col; +use std::hint::black_box; use std::path::PathBuf; use std::sync::Arc; use test_utils::tpcds::tpcds_schemas; @@ -50,12 +44,12 @@ const CLICKBENCH_DATA_PATH: &str = "data/hits_partitioned/"; /// Create a logical plan from the specified sql fn logical_plan(ctx: &SessionContext, rt: &Runtime, sql: &str) { - criterion::black_box(rt.block_on(ctx.sql(sql)).unwrap()); + black_box(rt.block_on(ctx.sql(sql)).unwrap()); } /// Create a physical ExecutionPlan (by way of logical plan) fn physical_plan(ctx: &SessionContext, rt: &Runtime, sql: &str) { - criterion::black_box(rt.block_on(async { + black_box(rt.block_on(async { ctx.sql(sql) .await .unwrap() @@ -65,150 +59,6 @@ fn physical_plan(ctx: &SessionContext, rt: &Runtime, sql: &str) { })); } -/// Build a dataframe for testing logical plan optimization -fn build_test_data_frame(ctx: &SessionContext, rt: &Runtime) -> DataFrame { - register_string_table(ctx, 100, 1000); - - rt.block_on(async { - let mut df = ctx.table("t").await.unwrap(); - // add some columns in - for i in 100..150 { - df = df - .with_column(&format!("c{i}"), Literal(ScalarValue::Utf8(None), None)) - .unwrap(); - } - // add in some columns with string encoded timestamps - for i in 150..175 { - df = df - .with_column( - &format!("c{i}"), - Literal(ScalarValue::Utf8(Some("2025-08-21 09:43:17".into())), None), - ) - .unwrap(); - } - // do a bunch of ops on the columns - for i in 0..175 { - // trim the columns - df = df - .with_column(&format!("c{i}"), btrim(vec![col(format!("c{i}"))])) - .unwrap(); - } - - for i in 0..175 { - let c_name = format!("c{i}"); - let c = col(&c_name); - - // random ops - if i % 5 == 0 && i < 150 { - // the actual ops here are largely unimportant as they are just a sample - // of ops that could occur on a dataframe - df = df - .with_column(&c_name, cast(c.clone(), DataType::Utf8)) - .unwrap() - .with_column( - &c_name, - when( - cast(c.clone(), DataType::Int32).gt(lit(135)), - cast( - cast(c.clone(), DataType::Int32) - lit(i + 3), - DataType::Utf8, - ), - ) - .otherwise(c.clone()) - .unwrap(), - ) - .unwrap() - .with_column( - 
&c_name, - when( - c.clone().is_not_null().and( - cast(c.clone(), DataType::Int32) - .between(lit(120), lit(130)), - ), - Literal(ScalarValue::Utf8(None), None), - ) - .otherwise( - when( - c.clone().is_not_null().and(regexp_like( - cast(c.clone(), DataType::Utf8View), - lit("[0-9]*"), - None, - )), - upper(c.clone()), - ) - .otherwise(c.clone()) - .unwrap(), - ) - .unwrap(), - ) - .unwrap() - .with_column( - &c_name, - when( - c.clone().is_not_null().and( - cast(c.clone(), DataType::Int32) - .between(lit(90), lit(100)), - ), - cast(c.clone(), DataType::Utf8View), - ) - .otherwise(Literal(ScalarValue::Date32(None), None)) - .unwrap(), - ) - .unwrap() - .with_column( - &c_name, - when( - c.clone().is_not_null().and( - cast(c.clone(), DataType::Int32).rem(lit(10)).gt(lit(7)), - ), - regexp_replace( - cast(c.clone(), DataType::Utf8View), - lit("1"), - lit("a"), - None, - ), - ) - .otherwise(Literal(ScalarValue::Date32(None), None)) - .unwrap(), - ) - .unwrap() - } - if i >= 150 { - df = df - .with_column( - &c_name, - try_cast( - to_timestamp(vec![c.clone(), lit("%Y-%m-%d %H:%M:%S")]), - DataType::Timestamp(Nanosecond, Some("UTC".into())), - ), - ) - .unwrap() - .with_column(&c_name, try_cast(c.clone(), DataType::Date32)) - .unwrap() - } - - // add in a few unions - if i % 30 == 0 { - let df1 = df - .clone() - .filter(length(c.clone()).gt(lit(2))) - .unwrap() - .with_column(&format!("c{i}_filtered"), lit(true)) - .unwrap(); - let df2 = df - .filter(not(length(c.clone()).gt(lit(2)))) - .unwrap() - .with_column(&format!("c{i}_filtered"), lit(false)) - .unwrap(); - - df = df1.union_by_name(df2).unwrap() - } - } - - df - }) -} - /// Create schema with the specified number of columns fn create_schema(column_prefix: &str, num_columns: usize) -> Schema { let fields: Fields = (0..num_columns) @@ -296,7 +146,7 @@ fn benchmark_with_param_values_many_columns( rt.block_on(async { ctx.state().statement_to_plan(statement).await.unwrap() }); b.iter(|| { let plan = plan.clone(); - criterion::black_box(plan.with_param_values(vec![ScalarValue::from(1)]).unwrap()); + black_box(plan.with_param_values(vec![ScalarValue::from(1)]).unwrap()); }); } @@ -334,33 +184,6 @@ fn register_union_order_table(ctx: &SessionContext, num_columns: usize, num_rows ctx.register_table("t", Arc::new(table)).unwrap(); } -/// Registers a table like this: -/// c0,c1,c2...,c99 -/// "0","100"..."9900" -/// "0","200"..."19800" -/// "0","300"..."29700" -fn register_string_table(ctx: &SessionContext, num_columns: usize, num_rows: usize) { - // ("c0", ["0", "0", ...]) - // ("c1": ["100", "200", ...]) - // etc - let iter = (0..num_columns).map(|i| i as u64).map(|i| { - let array: ArrayRef = Arc::new(arrow::array::StringViewArray::from_iter_values( - (0..num_rows) - .map(|j| format!("c{}", j as u64 * 100 + i)) - .collect::>(), - )); - (format!("c{i}"), array) - }); - let batch = RecordBatch::try_from_iter(iter).unwrap(); - let schema = batch.schema(); - let partitions = vec![vec![batch]]; - - // create the table - let table = MemTable::try_new(schema, partitions).unwrap(); - - ctx.register_table("t", Arc::new(table)).unwrap(); -} - /// return a query like /// ```sql /// select c1, 2 as c2, ... 
n as cn from t ORDER BY c1 @@ -579,7 +402,8 @@ fn criterion_benchmark(c: &mut Criterion) { }); // -- Sorted Queries -- - for column_count in [10, 50, 100, 200, 300] { + // 100, 200 && 300 is taking too long - https://github.com/apache/datafusion/issues/18366 + for column_count in [10, 50 /* 100, 200, 300 */] { register_union_order_table(&ctx, column_count, 1000); // this query has many expressions in its sort order so stresses @@ -596,20 +420,6 @@ fn criterion_benchmark(c: &mut Criterion) { let _ = ctx.deregister_table("t"); } - // -- validate logical plan optimize performance - let df = build_test_data_frame(&ctx, &rt); - - c.bench_function("logical_plan_optimize", |b| { - b.iter(|| { - let df_clone = df.clone(); - criterion::black_box( - rt.block_on(async { df_clone.into_optimized_plan().unwrap() }), - ); - }) - }); - - let _ = ctx.deregister_table("t"); - // --- TPC-H --- let tpch_ctx = register_defs(SessionContext::new(), tpch_schemas()); diff --git a/datafusion/core/benches/sql_planner_extended.rs b/datafusion/core/benches/sql_planner_extended.rs new file mode 100644 index 000000000000..aff7cb4d101d --- /dev/null +++ b/datafusion/core/benches/sql_planner_extended.rs @@ -0,0 +1,233 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{ArrayRef, RecordBatch}; +use arrow_schema::DataType; +use arrow_schema::TimeUnit::Nanosecond; +use criterion::{criterion_group, criterion_main, Criterion}; +use datafusion::prelude::{DataFrame, SessionContext}; +use datafusion_catalog::MemTable; +use datafusion_common::ScalarValue; +use datafusion_expr::Expr::Literal; +use datafusion_expr::{cast, col, lit, not, try_cast, when}; +use datafusion_functions::expr_fn::{ + btrim, length, regexp_like, regexp_replace, to_timestamp, upper, +}; +use std::hint::black_box; +use std::ops::Rem; +use std::sync::Arc; +use tokio::runtime::Runtime; + +// This benchmark suite is designed to test the performance of +// logical planning with a large plan containing unions, many columns +// with a variety of operations in it. +// +// Since it is (currently) very slow to execute it has been separated +// out from the sql_planner benchmark suite to this file. +// +// See https://github.com/apache/datafusion/issues/17261 for details. 
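Since the comment above describes what the extracted suite measures, here is a small, hedged sketch (not taken from the patch) of the core operation it times, shown on a trivially small plan:

```rust
use datafusion::error::Result;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();
    let df = ctx
        .read_csv("tests/data/example.csv", CsvReadOptions::new())
        .await?
        .with_column("ab", col("a") + col("b"))?;
    // `into_optimized_plan` runs the logical optimizer; the
    // `logical_plan_optimize` benchmark below times this call on the much
    // larger plan produced by `build_test_data_frame`.
    let plan = df.into_optimized_plan()?;
    println!("{}", plan.display_indent());
    Ok(())
}
```

With the `[[bench]]` entry added to `datafusion/core/Cargo.toml` above, the suite can also be run on its own, for example via `cargo bench --bench sql_planner_extended`.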
+ +/// Registers a table like this: +/// c0,c1,c2...,c99 +/// "0","100"..."9900" +/// "0","200"..."19800" +/// "0","300"..."29700" +fn register_string_table(ctx: &SessionContext, num_columns: usize, num_rows: usize) { + // ("c0", ["0", "0", ...]) + // ("c1": ["100", "200", ...]) + // etc + let iter = (0..num_columns).map(|i| i as u64).map(|i| { + let array: ArrayRef = Arc::new(arrow::array::StringViewArray::from_iter_values( + (0..num_rows) + .map(|j| format!("c{}", j as u64 * 100 + i)) + .collect::>(), + )); + (format!("c{i}"), array) + }); + let batch = RecordBatch::try_from_iter(iter).unwrap(); + let schema = batch.schema(); + let partitions = vec![vec![batch]]; + + // create the table + let table = MemTable::try_new(schema, partitions).unwrap(); + + ctx.register_table("t", Arc::new(table)).unwrap(); +} + +/// Build a dataframe for testing logical plan optimization +fn build_test_data_frame(ctx: &SessionContext, rt: &Runtime) -> DataFrame { + register_string_table(ctx, 100, 1000); + + rt.block_on(async { + let mut df = ctx.table("t").await.unwrap(); + // add some columns in + for i in 100..150 { + df = df + .with_column(&format!("c{i}"), Literal(ScalarValue::Utf8(None), None)) + .unwrap(); + } + // add in some columns with string encoded timestamps + for i in 150..175 { + df = df + .with_column( + &format!("c{i}"), + Literal(ScalarValue::Utf8(Some("2025-08-21 09:43:17".into())), None), + ) + .unwrap(); + } + // do a bunch of ops on the columns + for i in 0..175 { + // trim the columns + df = df + .with_column(&format!("c{i}"), btrim(vec![col(format!("c{i}"))])) + .unwrap(); + } + + for i in 0..175 { + let c_name = format!("c{i}"); + let c = col(&c_name); + + // random ops + if i % 5 == 0 && i < 150 { + // the actual ops here are largely unimportant as they are just a sample + // of ops that could occur on a dataframe + df = df + .with_column(&c_name, cast(c.clone(), DataType::Utf8)) + .unwrap() + .with_column( + &c_name, + when( + cast(c.clone(), DataType::Int32).gt(lit(135)), + cast( + cast(c.clone(), DataType::Int32) - lit(i + 3), + DataType::Utf8, + ), + ) + .otherwise(c.clone()) + .unwrap(), + ) + .unwrap() + .with_column( + &c_name, + when( + c.clone().is_not_null().and( + cast(c.clone(), DataType::Int32) + .between(lit(120), lit(130)), + ), + Literal(ScalarValue::Utf8(None), None), + ) + .otherwise( + when( + c.clone().is_not_null().and(regexp_like( + cast(c.clone(), DataType::Utf8View), + lit("[0-9]*"), + None, + )), + upper(c.clone()), + ) + .otherwise(c.clone()) + .unwrap(), + ) + .unwrap(), + ) + .unwrap() + .with_column( + &c_name, + when( + c.clone().is_not_null().and( + cast(c.clone(), DataType::Int32) + .between(lit(90), lit(100)), + ), + cast(c.clone(), DataType::Utf8View), + ) + .otherwise(Literal(ScalarValue::Date32(None), None)) + .unwrap(), + ) + .unwrap() + .with_column( + &c_name, + when( + c.clone().is_not_null().and( + cast(c.clone(), DataType::Int32).rem(lit(10)).gt(lit(7)), + ), + regexp_replace( + cast(c.clone(), DataType::Utf8View), + lit("1"), + lit("a"), + None, + ), + ) + .otherwise(Literal(ScalarValue::Date32(None), None)) + .unwrap(), + ) + .unwrap() + } + if i >= 150 { + df = df + .with_column( + &c_name, + try_cast( + to_timestamp(vec![c.clone(), lit("%Y-%m-%d %H:%M:%S")]), + DataType::Timestamp(Nanosecond, Some("UTC".into())), + ), + ) + .unwrap() + .with_column(&c_name, try_cast(c.clone(), DataType::Date32)) + .unwrap() + } + + // add in a few unions + if i % 30 == 0 { + let df1 = df + .clone() + .filter(length(c.clone()).gt(lit(2))) + .unwrap() + 
.with_column(&format!("c{i}_filtered"), lit(true)) + .unwrap(); + let df2 = df + .filter(not(length(c.clone()).gt(lit(2)))) + .unwrap() + .with_column(&format!("c{i}_filtered"), lit(false)) + .unwrap(); + + df = df1.union_by_name(df2).unwrap() + } + } + + df + }) +} + +fn criterion_benchmark(c: &mut Criterion) { + let ctx = SessionContext::new(); + let rt = Runtime::new().unwrap(); + + // validate logical plan optimize performance + // https://github.com/apache/datafusion/issues/17261 + + let df = build_test_data_frame(&ctx, &rt); + + c.bench_function("logical_plan_optimize", |b| { + b.iter(|| { + let df_clone = df.clone(); + black_box(rt.block_on(async { df_clone.into_optimized_plan().unwrap() })); + }) + }); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/datafusion/core/benches/struct_query_sql.rs b/datafusion/core/benches/struct_query_sql.rs index f9cc43d1ea2c..5c7b42731082 100644 --- a/datafusion/core/benches/struct_query_sql.rs +++ b/datafusion/core/benches/struct_query_sql.rs @@ -24,13 +24,14 @@ use criterion::{criterion_group, criterion_main, Criterion}; use datafusion::prelude::SessionContext; use datafusion::{datasource::MemTable, error::Result}; use futures::executor::block_on; +use std::hint::black_box; use std::sync::Arc; use tokio::runtime::Runtime; async fn query(ctx: &SessionContext, rt: &Runtime, sql: &str) { // execute the query let df = rt.block_on(ctx.sql(sql)).unwrap(); - criterion::black_box(rt.block_on(df.collect()).unwrap()); + black_box(rt.block_on(df.collect()).unwrap()); } fn create_context(array_len: usize, batch_size: usize) -> Result { diff --git a/datafusion/core/benches/topk_aggregate.rs b/datafusion/core/benches/topk_aggregate.rs index cf3c7fa2e26f..7971293c9ce2 100644 --- a/datafusion/core/benches/topk_aggregate.rs +++ b/datafusion/core/benches/topk_aggregate.rs @@ -16,6 +16,7 @@ // under the License. 
mod data_utils; + use arrow::util::pretty::pretty_format_batches; use criterion::{criterion_group, criterion_main, Criterion}; use data_utils::make_data; @@ -24,6 +25,7 @@ use datafusion::prelude::SessionContext; use datafusion::{datasource::MemTable, error::Result}; use datafusion_execution::config::SessionConfig; use datafusion_execution::TaskContext; +use std::hint::black_box; use std::sync::Arc; use tokio::runtime::Runtime; @@ -44,7 +46,7 @@ async fn create_context( opts.optimizer.enable_topk_aggregation = use_topk; let ctx = SessionContext::new_with_config(cfg); let _ = ctx.register_table("traces", mem_table)?; - let sql = format!("select trace_id, max(timestamp_ms) from traces group by trace_id order by max(timestamp_ms) desc limit {limit};"); + let sql = format!("select max(timestamp_ms) from traces group by trace_id order by max(timestamp_ms) desc limit {limit};"); let df = ctx.sql(sql.as_str()).await?; let physical_plan = df.create_physical_plan().await?; let actual_phys_plan = displayable(physical_plan.as_ref()).indent(true).to_string(); @@ -57,10 +59,8 @@ async fn create_context( } fn run(rt: &Runtime, plan: Arc, ctx: Arc, asc: bool) { - criterion::black_box( - rt.block_on(async { aggregate(plan.clone(), ctx.clone(), asc).await }), - ) - .unwrap(); + black_box(rt.block_on(async { aggregate(plan.clone(), ctx.clone(), asc).await })) + .unwrap(); } async fn aggregate( @@ -75,20 +75,20 @@ async fn aggregate( let actual = format!("{}", pretty_format_batches(&batches)?).to_lowercase(); let expected_asc = r#" -+----------------------------------+--------------------------+ -| trace_id | max(traces.timestamp_ms) | -+----------------------------------+--------------------------+ -| 5868861a23ed31355efc5200eb80fe74 | 16909009999999 | -| 4040e64656804c3d77320d7a0e7eb1f0 | 16909009999998 | -| 02801bbe533190a9f8713d75222f445d | 16909009999997 | -| 9e31b3b5a620de32b68fefa5aeea57f1 | 16909009999996 | -| 2d88a860e9bd1cfaa632d8e7caeaa934 | 16909009999995 | -| a47edcef8364ab6f191dd9103e51c171 | 16909009999994 | -| 36a3fa2ccfbf8e00337f0b1254384db6 | 16909009999993 | -| 0756be84f57369012e10de18b57d8a2f | 16909009999992 | -| d4d6bf9845fa5897710e3a8db81d5907 | 16909009999991 | -| 3c2cc1abe728a66b61e14880b53482a0 | 16909009999990 | -+----------------------------------+--------------------------+ ++--------------------------+ +| max(traces.timestamp_ms) | ++--------------------------+ +| 16909009999999 | +| 16909009999998 | +| 16909009999997 | +| 16909009999996 | +| 16909009999995 | +| 16909009999994 | +| 16909009999993 | +| 16909009999992 | +| 16909009999991 | +| 16909009999990 | ++--------------------------+ "# .trim(); if asc { diff --git a/datafusion/core/benches/window_query_sql.rs b/datafusion/core/benches/window_query_sql.rs index a55d17a7c5dc..6d83959f7eb3 100644 --- a/datafusion/core/benches/window_query_sql.rs +++ b/datafusion/core/benches/window_query_sql.rs @@ -21,17 +21,19 @@ extern crate arrow; extern crate datafusion; mod data_utils; + use crate::criterion::Criterion; use data_utils::create_table_provider; use datafusion::error::Result; use datafusion::execution::context::SessionContext; use parking_lot::Mutex; +use std::hint::black_box; use std::sync::Arc; use tokio::runtime::Runtime; fn query(ctx: Arc>, rt: &Runtime, sql: &str) { let df = rt.block_on(ctx.lock().sql(sql)).unwrap(); - criterion::black_box(rt.block_on(df.collect()).unwrap()); + black_box(rt.block_on(df.collect()).unwrap()); } fn create_context( diff --git a/datafusion/core/src/dataframe/mod.rs 
b/datafusion/core/src/dataframe/mod.rs index 287a133273d8..98804e424b40 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -258,10 +258,13 @@ impl DataFrame { /// # async fn main() -> Result<()> { /// // datafusion will parse number as i64 first. /// let sql = "a > 1 and b in (1, 10)"; - /// let expected = col("a").gt(lit(1 as i64)) - /// .and(col("b").in_list(vec![lit(1 as i64), lit(10 as i64)], false)); + /// let expected = col("a") + /// .gt(lit(1 as i64)) + /// .and(col("b").in_list(vec![lit(1 as i64), lit(10 as i64)], false)); /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let expr = df.parse_sql_expr(sql)?; /// assert_eq!(expected, expr); /// # Ok(()) @@ -289,14 +292,16 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.select_columns(&["a", "b"])?; /// let expected = vec![ /// "+---+---+", /// "| a | b |", /// "+---+---+", /// "| 1 | 2 |", - /// "+---+---+" + /// "+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -305,12 +310,12 @@ impl DataFrame { pub fn select_columns(self, columns: &[&str]) -> Result { let fields = columns .iter() - .map(|name| { + .flat_map(|name| { self.plan .schema() - .qualified_field_with_unqualified_name(name) + .qualified_fields_with_unqualified_name(name) }) - .collect::>>()?; + .collect::>(); let expr: Vec = fields .into_iter() .map(|(qualifier, field)| Expr::Column(Column::from((qualifier, field)))) @@ -329,8 +334,10 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; - /// let df : DataFrame = df.select_exprs(&["a * b", "c"])?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; + /// let df: DataFrame = df.select_exprs(&["a * b", "c"])?; /// # Ok(()) /// # } /// ``` @@ -357,14 +364,16 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.select(vec![col("a"), col("b") * col("c")])?; /// let expected = vec![ /// "+---+-----------------------+", /// "| a | ?table?.b * ?table?.c |", /// "+---+-----------------------+", /// "| 1 | 6 |", - /// "+---+-----------------------+" + /// "+---+-----------------------+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -407,7 +416,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// // +----+----+----+ /// // | a | b | c | /// // +----+----+----+ @@ -419,7 +430,7 @@ impl DataFrame { /// "| b | c |", /// "+---+---+", /// "| 2 | 3 |", - /// "+---+---+" 
+ /// "+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -428,13 +439,12 @@ impl DataFrame { pub fn drop_columns(self, columns: &[&str]) -> Result { let fields_to_drop = columns .iter() - .map(|name| { + .flat_map(|name| { self.plan .schema() - .qualified_field_with_unqualified_name(name) + .qualified_fields_with_unqualified_name(name) }) - .filter(|r| r.is_ok()) - .collect::>>()?; + .collect::>(); let expr: Vec = self .plan .schema() @@ -518,7 +528,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.filter(col("a").lt_eq(col("b")))?; /// // all rows where a <= b are returned /// let expected = vec![ @@ -528,7 +540,7 @@ impl DataFrame { /// "| 1 | 2 | 3 |", /// "| 4 | 5 | 6 |", /// "| 7 | 8 | 9 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -557,7 +569,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new()) + /// .await?; /// /// // The following use is the equivalent of "SELECT MIN(b) GROUP BY a" /// let df1 = df.clone().aggregate(vec![col("a")], vec![min(col("b"))])?; @@ -568,7 +582,7 @@ impl DataFrame { /// "| 1 | 2 |", /// "| 4 | 5 |", /// "| 7 | 8 |", - /// "+---+----------------+" + /// "+---+----------------+", /// ]; /// assert_batches_sorted_eq!(expected1, &df1.collect().await?); /// // The following use is the equivalent of "SELECT MIN(b)" @@ -578,7 +592,7 @@ impl DataFrame { /// "| min(?table?.b) |", /// "+----------------+", /// "| 2 |", - /// "+----------------+" + /// "+----------------+", /// ]; /// # assert_batches_sorted_eq!(expected2, &df2.collect().await?); /// # Ok(()) @@ -646,7 +660,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.limit(1, Some(2))?; /// let expected = vec![ /// "+---+---+---+", @@ -654,7 +670,7 @@ impl DataFrame { /// "+---+---+---+", /// "| 4 | 5 | 6 |", /// "| 7 | 8 | 9 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -683,7 +699,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await? 
; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let d2 = df.clone(); /// let df = df.union(d2)?; /// let expected = vec![ @@ -692,7 +710,7 @@ impl DataFrame { /// "+---+---+---+", /// "| 1 | 2 | 3 |", /// "| 1 | 2 | 3 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -723,8 +741,13 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; - /// let d2 = df.clone().select_columns(&["b", "c", "a"])?.with_column("d", lit("77"))?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; + /// let d2 = df + /// .clone() + /// .select_columns(&["b", "c", "a"])? + /// .with_column("d", lit("77"))?; /// let df = df.union_by_name(d2)?; /// let expected = vec![ /// "+---+---+---+----+", @@ -732,7 +755,7 @@ impl DataFrame { /// "+---+---+---+----+", /// "| 1 | 2 | 3 | |", /// "| 1 | 2 | 3 | 77 |", - /// "+---+---+---+----+" + /// "+---+---+---+----+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -762,7 +785,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let d2 = df.clone(); /// let df = df.union_distinct(d2)?; /// // df2 are duplicate of df @@ -771,7 +796,7 @@ impl DataFrame { /// "| a | b | c |", /// "+---+---+---+", /// "| 1 | 2 | 3 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -802,7 +827,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let d2 = df.clone().select_columns(&["b", "c", "a"])?; /// let df = df.union_by_name_distinct(d2)?; /// let expected = vec![ @@ -810,7 +837,7 @@ impl DataFrame { /// "| a | b | c |", /// "+---+---+---+", /// "| 1 | 2 | 3 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -837,14 +864,16 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.distinct()?; /// let expected = vec![ /// "+---+---+---+", /// "| a | b | c |", /// "+---+---+---+", /// "| 1 | 2 | 3 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -871,15 +900,17 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await? 
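The `select_columns` and `drop_columns` hunks earlier in this file replace `qualified_field_with_unqualified_name` with a `flat_map` over `qualified_fields_with_unqualified_name`, so an unqualified name that resolves to several qualified fields now yields all of them instead of a lookup error. A hedged sketch of the kind of call this affects (table and column names are illustrative, not from the patch):

```rust
use datafusion::common::JoinType;
use datafusion::error::Result;
use datafusion::prelude::*;

async fn select_all_a(ctx: &SessionContext) -> Result<DataFrame> {
    // "t1" and "t2" are assumed to be registered tables that both expose a
    // column named "a" and share a join key "id".
    let joined = ctx.table("t1").await?.join(
        ctx.table("t2").await?,
        JoinType::Inner,
        &["id"],
        &["id"],
        None,
    )?;
    // With the flat_map-based lookup this projects every field whose
    // unqualified name is "a" (t1.a and t2.a); drop_columns(&["a"]) likewise
    // drops all of them.
    joined.select_columns(&["a"])
}
```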
- /// // Return a single row (a, b) for each distinct value of a - /// .distinct_on(vec![col("a")], vec![col("a"), col("b")], None)?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await? + /// // Return a single row (a, b) for each distinct value of a + /// .distinct_on(vec![col("a")], vec![col("a"), col("b")], None)?; /// let expected = vec![ /// "+---+---+", /// "| a | b |", /// "+---+---+", /// "| 1 | 2 |", - /// "+---+---+" + /// "+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -952,7 +983,7 @@ impl DataFrame { })); //collect recordBatch - let describe_record_batch = vec![ + let describe_record_batch = [ // count aggregation self.clone().aggregate( vec![], @@ -1125,11 +1156,13 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.sort(vec![ - /// col("a").sort(false, true), // a DESC, nulls first - /// col("b").sort(true, false), // b ASC, nulls last - /// ])?; + /// col("a").sort(false, true), // a DESC, nulls first + /// col("b").sort(true, false), // b ASC, nulls last + /// ])?; /// let expected = vec![ /// "+---+---+---+", /// "| a | b | c |", @@ -1176,12 +1209,17 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let left = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; - /// let right = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await? - /// .select(vec![ - /// col("a").alias("a2"), - /// col("b").alias("b2"), - /// col("c").alias("c2")])?; + /// let left = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; + /// let right = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await? + /// .select(vec![ + /// col("a").alias("a2"), + /// col("b").alias("b2"), + /// col("c").alias("c2"), + /// ])?; /// // Perform the equivalent of `left INNER JOIN right ON (a = a2 AND b = b2)` /// // finding all pairs of rows from `left` and `right` where `a = a2` and `b = b2`. 
/// let join = left.join(right, JoinType::Inner, &["a", "b"], &["a2", "b2"], None)?; @@ -1190,13 +1228,12 @@ impl DataFrame { /// "| a | b | c | a2 | b2 | c2 |", /// "+---+---+---+----+----+----+", /// "| 1 | 2 | 3 | 1 | 2 | 3 |", - /// "+---+---+---+----+----+----+" + /// "+---+---+---+----+----+----+", /// ]; /// assert_batches_sorted_eq!(expected, &join.collect().await?); /// # Ok(()) /// # } /// ``` - /// pub fn join( self, right: DataFrame, @@ -1258,7 +1295,7 @@ impl DataFrame { /// "+---+---+---+----+----+----+", /// "| a | b | c | a2 | b2 | c2 |", /// "+---+---+---+----+----+----+", - /// "+---+---+---+----+----+----+" + /// "+---+---+---+----+----+----+", /// ]; /// # assert_batches_sorted_eq!(expected, &join_on.collect().await?); /// # Ok(()) @@ -1290,7 +1327,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new()) + /// .await?; /// let df1 = df.repartition(Partitioning::RoundRobinBatch(4))?; /// let expected = vec![ /// "+---+---+---+", @@ -1299,7 +1338,7 @@ impl DataFrame { /// "| 1 | 2 | 3 |", /// "| 4 | 5 | 6 |", /// "| 7 | 8 | 9 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df1.collect().await?); /// # Ok(()) @@ -1328,7 +1367,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let count = df.count().await?; // 1 /// # assert_eq!(count, 1); /// # Ok(()) @@ -1367,7 +1408,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let batches = df.collect().await?; /// # Ok(()) /// # } @@ -1387,7 +1430,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// df.show().await?; /// # Ok(()) /// # } @@ -1446,7 +1491,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// df.show_limit(10).await?; /// # Ok(()) /// # } @@ -1472,7 +1519,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let stream = df.execute_stream().await?; /// # Ok(()) /// # } @@ -1498,7 +1547,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + 
/// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let batches = df.collect_partitioned().await?; /// # Ok(()) /// # } @@ -1518,7 +1569,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let batches = df.execute_stream_partitioned().await?; /// # Ok(()) /// # } @@ -1547,7 +1600,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let schema = df.schema(); /// # Ok(()) /// # } @@ -1598,7 +1653,19 @@ impl DataFrame { /// Note: This discards the [`SessionState`] associated with this /// [`DataFrame`] in favour of the one passed to [`TableProvider::scan`] pub fn into_view(self) -> Arc { - Arc::new(DataFrameTableProvider { plan: self.plan }) + Arc::new(DataFrameTableProvider { + plan: self.plan, + table_type: TableType::Temporary, + }) + } + + /// See [`Self::into_view`]. The returned [`TableProvider`] will + /// create a transient table. + pub fn into_temporary_view(self) -> Arc { + Arc::new(DataFrameTableProvider { + plan: self.plan, + table_type: TableType::Temporary, + }) } /// Return a DataFrame with the explanation of its plan so far. @@ -1613,8 +1680,14 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; - /// let batches = df.limit(0, Some(100))?.explain(false, false)?.collect().await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; + /// let batches = df + /// .limit(0, Some(100))? + /// .explain(false, false)? + /// .collect() + /// .await?; /// # Ok(()) /// # } /// ``` @@ -1637,8 +1710,18 @@ impl DataFrame { /// # async fn main() -> Result<()> { /// use datafusion_expr::{Explain, ExplainOption}; /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; - /// let batches = df.limit(0, Some(100))?.explain_with_options(ExplainOption::default().with_verbose(false).with_analyze(false))?.collect().await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; + /// let batches = df + /// .limit(0, Some(100))? + /// .explain_with_options( + /// ExplainOption::default() + /// .with_verbose(false) + /// .with_analyze(false), + /// )? 
+ /// .collect() + /// .await?; /// # Ok(()) /// # } /// ``` @@ -1668,7 +1751,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let f = df.registry(); /// // use f.udf("name", vec![...]) to use the udf /// # Ok(()) @@ -1687,15 +1772,19 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; - /// let d2 = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; + /// let d2 = ctx + /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.intersect(d2)?; /// let expected = vec![ /// "+---+---+---+", /// "| a | b | c |", /// "+---+---+---+", /// "| 1 | 2 | 3 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -1721,15 +1810,19 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; - /// let d2 = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; + /// let d2 = ctx + /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.intersect_distinct(d2)?; /// let expected = vec![ /// "+---+---+---+", /// "| a | b | c |", /// "+---+---+---+", /// "| 1 | 2 | 3 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -1755,8 +1848,12 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; - /// let d2 = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new()) + /// .await?; + /// let d2 = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let result = df.except(d2)?; /// // those columns are not in example.csv, but in example_long.csv /// let expected = vec![ @@ -1765,7 +1862,7 @@ impl DataFrame { /// "+---+---+---+", /// "| 4 | 5 | 6 |", /// "| 7 | 8 | 9 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &result.collect().await?); /// # Ok(()) @@ -1791,8 +1888,12 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; - /// let d2 = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new()) + /// .await?; + /// let d2 = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let result = df.except_distinct(d2)?; /// // those columns are not in example.csv, but in example_long.csv 
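The `into_view` hunk above gives `DataFrameTableProvider` an explicit `table_type` field and adds `into_temporary_view`. A hedged usage sketch (the view name is illustrative):

```rust
use datafusion::error::Result;
use datafusion::prelude::*;

async fn register_as_view(ctx: &SessionContext) -> Result<()> {
    let df = ctx
        .read_csv("tests/data/example.csv", CsvReadOptions::new())
        .await?;
    // Turn the DataFrame into a TableProvider and register it so later SQL
    // can refer to it by name; `into_temporary_view` is the variant added in
    // this patch.
    ctx.register_table("example_view", df.into_temporary_view())?;
    let _batches = ctx
        .sql("SELECT a, b FROM example_view")
        .await?
        .collect()
        .await?;
    Ok(())
}
```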
/// let expected = vec![ @@ -1801,7 +1902,7 @@ impl DataFrame { /// "+---+---+---+", /// "| 4 | 5 | 6 |", /// "| 7 | 8 | 9 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &result.collect().await?); /// # Ok(()) @@ -1878,13 +1979,15 @@ impl DataFrame { /// use datafusion::dataframe::DataFrameWriteOptions; /// let ctx = SessionContext::new(); /// // Sort the data by column "b" and write it to a new location - /// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await? - /// .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first - /// .write_csv( - /// "output.csv", - /// DataFrameWriteOptions::new(), - /// None, // can also specify CSV writing options here - /// ).await?; + /// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await? + /// .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first + /// .write_csv( + /// "output.csv", + /// DataFrameWriteOptions::new(), + /// None, // can also specify CSV writing options here + /// ) + /// .await?; /// # fs::remove_file("output.csv")?; /// # Ok(()) /// # } @@ -1948,13 +2051,11 @@ impl DataFrame { /// use datafusion::dataframe::DataFrameWriteOptions; /// let ctx = SessionContext::new(); /// // Sort the data by column "b" and write it to a new location - /// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await? - /// .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first - /// .write_json( - /// "output.json", - /// DataFrameWriteOptions::new(), - /// None - /// ).await?; + /// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await? + /// .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first + /// .write_json("output.json", DataFrameWriteOptions::new(), None) + /// .await?; /// # fs::remove_file("output.json")?; /// # Ok(()) /// # } @@ -2015,7 +2116,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.with_column("ab_sum", col("a") + col("b"))?; /// # Ok(()) /// # } @@ -2089,7 +2192,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.with_column_renamed("ab_sum", "total")?; /// /// # Ok(()) @@ -2222,7 +2327,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.cache().await?; /// # Ok(()) /// # } @@ -2266,7 +2373,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// // Fill nulls in only columns "a" and "c": /// let df = df.fill_null(ScalarValue::from(0), vec!["a".to_owned(), "c".to_owned()])?; /// // Fill nulls across all columns: @@ -2337,9 
+2446,9 @@ impl DataFrame { /// Helper for creating DataFrame. /// # Example /// ``` - /// use std::sync::Arc; /// use arrow::array::{ArrayRef, Int32Array, StringArray}; /// use datafusion::prelude::DataFrame; + /// use std::sync::Arc; /// let id: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3])); /// let name: ArrayRef = Arc::new(StringArray::from(vec!["foo", "bar", "baz"])); /// let df = DataFrame::from_columns(vec![("id", id), ("name", name)]).unwrap(); @@ -2426,6 +2535,7 @@ macro_rules! dataframe { #[derive(Debug)] struct DataFrameTableProvider { plan: LogicalPlan, + table_type: TableType, } #[async_trait] @@ -2451,7 +2561,7 @@ impl TableProvider for DataFrameTableProvider { } fn table_type(&self) -> TableType { - TableType::View + self.table_type } async fn scan( diff --git a/datafusion/core/src/dataframe/parquet.rs b/datafusion/core/src/dataframe/parquet.rs index d46a902ca513..cb8a6cf29541 100644 --- a/datafusion/core/src/dataframe/parquet.rs +++ b/datafusion/core/src/dataframe/parquet.rs @@ -42,13 +42,15 @@ impl DataFrame { /// use datafusion::dataframe::DataFrameWriteOptions; /// let ctx = SessionContext::new(); /// // Sort the data by column "b" and write it to a new location - /// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await? - /// .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first - /// .write_parquet( - /// "output.parquet", - /// DataFrameWriteOptions::new(), - /// None, // can also specify parquet writing options here - /// ).await?; + /// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await? + /// .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first + /// .write_parquet( + /// "output.parquet", + /// DataFrameWriteOptions::new(), + /// None, // can also specify parquet writing options here + /// ) + /// .await?; /// # fs::remove_file("output.parquet")?; /// # Ok(()) /// # } @@ -116,6 +118,8 @@ mod tests { use datafusion_execution::config::SessionConfig; use datafusion_expr::{col, lit}; + #[cfg(feature = "parquet_encryption")] + use datafusion_common::config::ConfigFileEncryptionProperties; use object_store::local::LocalFileSystem; use parquet::file::reader::FileReader; use tempfile::TempDir; @@ -280,7 +284,8 @@ mod tests { // Write encrypted parquet using write_parquet let mut options = TableParquetOptions::default(); - options.crypto.file_encryption = Some((&encrypt).into()); + options.crypto.file_encryption = + Some(ConfigFileEncryptionProperties::from(&encrypt)); options.global.allow_single_file_parallelism = allow_single_file_parallelism; df.write_parquet( diff --git a/datafusion/core/src/datasource/file_format/mod.rs b/datafusion/core/src/datasource/file_format/mod.rs index e165707c2eb0..7c55d452c4e1 100644 --- a/datafusion/core/src/datasource/file_format/mod.rs +++ b/datafusion/core/src/datasource/file_format/mod.rs @@ -40,6 +40,7 @@ pub(crate) mod test_util { use datafusion_catalog::Session; use datafusion_common::Result; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; + use datafusion_datasource::TableSchema; use datafusion_datasource::{file_format::FileFormat, PartitionedFile}; use datafusion_execution::object_store::ObjectStoreUrl; use std::sync::Arc; @@ -66,6 +67,8 @@ pub(crate) mod test_util { .await? 
}; + let table_schema = TableSchema::new(file_schema.clone(), vec![]); + let statistics = format .infer_stats(state, &store, file_schema.clone(), &meta) .await?; @@ -85,12 +88,11 @@ pub(crate) mod test_util { state, FileScanConfigBuilder::new( ObjectStoreUrl::local_filesystem(), - file_schema, - format.file_source(), + format.file_source(table_schema), ) .with_file_groups(file_groups) .with_statistics(statistics) - .with_projection(projection) + .with_projection_indices(projection) .with_limit(limit) .build(), ) diff --git a/datafusion/core/src/datasource/file_format/options.rs b/datafusion/core/src/datasource/file_format/options.rs index 8c1bb02ef073..e78c5f09553c 100644 --- a/datafusion/core/src/datasource/file_format/options.rs +++ b/datafusion/core/src/datasource/file_format/options.rs @@ -269,6 +269,8 @@ pub struct ParquetReadOptions<'a> { pub file_sort_order: Vec>, /// Properties for decryption of Parquet files that use modular encryption pub file_decryption_properties: Option, + /// Metadata size hint for Parquet files reading (in bytes) + pub metadata_size_hint: Option, } impl Default for ParquetReadOptions<'_> { @@ -281,6 +283,7 @@ impl Default for ParquetReadOptions<'_> { schema: None, file_sort_order: vec![], file_decryption_properties: None, + metadata_size_hint: None, } } } @@ -340,6 +343,12 @@ impl<'a> ParquetReadOptions<'a> { self.file_decryption_properties = Some(file_decryption_properties); self } + + /// Configure metadata size hint for Parquet files reading (in bytes) + pub fn metadata_size_hint(mut self, size_hint: Option) -> Self { + self.metadata_size_hint = size_hint; + self + } } /// Options that control the reading of ARROW files. @@ -606,6 +615,11 @@ impl ReadOptions<'_> for ParquetReadOptions<'_> { if let Some(file_decryption_properties) = &self.file_decryption_properties { options.crypto.file_decryption = Some(file_decryption_properties.clone()); } + // This can be overridden per-read in ParquetReadOptions, if setting. 
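The `ParquetReadOptions` hunks above add a per-read `metadata_size_hint`, and, as the comment just above notes, the `ReadOptions` impl copies it into `options.global.metadata_size_hint`. A hedged sketch of how a caller might set it (the path and the 512 KiB value are illustrative):

```rust
use datafusion::error::Result;
use datafusion::prelude::*;

async fn read_with_hint(ctx: &SessionContext) -> Result<DataFrame> {
    // Suggest prefetching ~512 KiB from the end of each file, which is often
    // enough to read the footer and page indexes in a single request.
    ctx.read_parquet(
        "data/example.parquet",
        ParquetReadOptions::default().metadata_size_hint(Some(512 * 1024)),
    )
    .await
}
```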
+ if let Some(metadata_size_hint) = self.metadata_size_hint { + options.global.metadata_size_hint = Some(metadata_size_hint); + } + let mut file_format = ParquetFormat::new().with_options(options); if let Some(parquet_pruning) = self.parquet_pruning { diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index 088c4408fff5..52c5393e1031 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -154,7 +154,6 @@ mod tests { use futures::stream::BoxStream; use futures::StreamExt; use insta::assert_snapshot; - use log::error; use object_store::local::LocalFileSystem; use object_store::ObjectMeta; use object_store::{ @@ -163,9 +162,10 @@ mod tests { }; use parquet::arrow::arrow_reader::ArrowReaderOptions; use parquet::arrow::ParquetRecordBatchStreamBuilder; - use parquet::file::metadata::{KeyValue, ParquetColumnIndex, ParquetOffsetIndex}; - use parquet::file::page_index::index::Index; - use parquet::format::FileMetaData; + use parquet::file::metadata::{ + KeyValue, ParquetColumnIndex, ParquetMetaData, ParquetOffsetIndex, + }; + use parquet::file::page_index::column_index::ColumnIndexMetaData; use tokio::fs::File; enum ForceViews { @@ -546,7 +546,8 @@ mod tests { let (files, _file_names) = store_parquet(vec![batch1], false).await?; let state = SessionContext::new().state(); - let format = ParquetFormat::default(); + // Make metadata size hint None to keep original behavior + let format = ParquetFormat::default().with_metadata_size_hint(None); let _schema = format.infer_schema(&state, &store.upcast(), &files).await?; assert_eq!(store.request_count(), 3); // No increase, cache being used. @@ -620,7 +621,9 @@ mod tests { let mut state = SessionContext::new().state(); state = set_view_state(state, force_views); - let format = ParquetFormat::default().with_force_view_types(force_views); + let format = ParquetFormat::default() + .with_force_view_types(force_views) + .with_metadata_size_hint(None); let schema = format.infer_schema(&state, &store.upcast(), &files).await?; assert_eq!(store.request_count(), 6); @@ -1144,18 +1147,14 @@ mod tests { // 325 pages in int_col assert_eq!(int_col_offset.len(), 325); - match int_col_index { - Index::INT32(index) => { - assert_eq!(index.indexes.len(), 325); - for min_max in index.clone().indexes { - assert!(min_max.min.is_some()); - assert!(min_max.max.is_some()); - assert!(min_max.null_count.is_some()); - } - } - _ => { - error!("fail to read page index.") - } + let ColumnIndexMetaData::INT32(index) = int_col_index else { + panic!("fail to read page index.") + }; + assert_eq!(index.min_values().len(), 325); + assert_eq!(index.max_values().len(), 325); + // all values are non null + for idx in 0..325 { + assert_eq!(index.null_count(idx), Some(0)); } } @@ -1556,7 +1555,7 @@ mod tests { Ok(parquet_sink) } - fn get_written(parquet_sink: Arc) -> Result<(Path, FileMetaData)> { + fn get_written(parquet_sink: Arc) -> Result<(Path, ParquetMetaData)> { let mut written = parquet_sink.written(); let written = written.drain(); assert_eq!( @@ -1566,28 +1565,33 @@ mod tests { written.len() ); - let (path, file_metadata) = written.take(1).next().unwrap(); - Ok((path, file_metadata)) + let (path, parquet_meta_data) = written.take(1).next().unwrap(); + Ok((path, parquet_meta_data)) } - fn assert_file_metadata(file_metadata: FileMetaData, expected_kv: &Vec) { - let FileMetaData { - num_rows, - schema, - key_value_metadata, - .. 
- } = file_metadata; - assert_eq!(num_rows, 2, "file metadata to have 2 rows"); + fn assert_file_metadata( + parquet_meta_data: ParquetMetaData, + expected_kv: &Vec, + ) { + let file_metadata = parquet_meta_data.file_metadata(); + let schema_descr = file_metadata.schema_descr(); + assert_eq!(file_metadata.num_rows(), 2, "file metadata to have 2 rows"); assert!( - schema.iter().any(|col_schema| col_schema.name == "a"), + schema_descr + .columns() + .iter() + .any(|col_schema| col_schema.name() == "a"), "output file metadata should contain col a" ); assert!( - schema.iter().any(|col_schema| col_schema.name == "b"), + schema_descr + .columns() + .iter() + .any(|col_schema| col_schema.name() == "b"), "output file metadata should contain col b" ); - let mut key_value_metadata = key_value_metadata.unwrap(); + let mut key_value_metadata = file_metadata.key_value_metadata().unwrap().clone(); key_value_metadata.sort_by(|a, b| a.key.cmp(&b.key)); assert_eq!(&key_value_metadata, expected_kv); } @@ -1644,13 +1648,11 @@ mod tests { // check the file metadata includes partitions let mut expected_partitions = std::collections::HashSet::from(["a=foo", "a=bar"]); - for ( - path, - FileMetaData { - num_rows, schema, .. - }, - ) in written.take(2) - { + for (path, parquet_metadata) in written.take(2) { + let file_metadata = parquet_metadata.file_metadata(); + let schema = file_metadata.schema_descr(); + let num_rows = file_metadata.num_rows(); + let path_parts = path.parts().collect::>(); assert_eq!(path_parts.len(), 2, "should have path prefix"); @@ -1663,11 +1665,17 @@ mod tests { assert_eq!(num_rows, 1, "file metadata to have 1 row"); assert!( - !schema.iter().any(|col_schema| col_schema.name == "a"), + !schema + .columns() + .iter() + .any(|col_schema| col_schema.name() == "a"), "output file metadata will not contain partitioned col a" ); assert!( - schema.iter().any(|col_schema| col_schema.name == "b"), + schema + .columns() + .iter() + .any(|col_schema| col_schema.name() == "b"), "output file metadata should contain col b" ); } diff --git a/datafusion/core/src/datasource/mod.rs b/datafusion/core/src/datasource/mod.rs index 94d651ddadd5..620e389a0fb8 100644 --- a/datafusion/core/src/datasource/mod.rs +++ b/datafusion/core/src/datasource/mod.rs @@ -45,6 +45,7 @@ pub use datafusion_catalog::view; pub use datafusion_datasource::schema_adapter; pub use datafusion_datasource::sink; pub use datafusion_datasource::source; +pub use datafusion_datasource::table_schema; pub use datafusion_execution::object_store; pub use datafusion_physical_expr::create_ordering; @@ -123,16 +124,13 @@ mod tests { let f2 = Field::new("extra_column", DataType::Utf8, true); let schema = Arc::new(Schema::new(vec![f1.clone(), f2.clone()])); - let source = ParquetSource::default() + let source = ParquetSource::new(Arc::clone(&schema)) .with_schema_adapter_factory(Arc::new(TestSchemaAdapterFactory {})) .unwrap(); - let base_conf = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - schema, - source, - ) - .with_file(partitioned_file) - .build(); + let base_conf = + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), source) + .with_file(partitioned_file) + .build(); let parquet_exec = DataSourceExec::from_data_source(base_conf); diff --git a/datafusion/core/src/datasource/physical_plan/avro.rs b/datafusion/core/src/datasource/physical_plan/avro.rs index 8a00af959ccc..1cf8c573acd9 100644 --- a/datafusion/core/src/datasource/physical_plan/avro.rs +++ b/datafusion/core/src/datasource/physical_plan/avro.rs @@ 
-34,7 +34,7 @@ mod tests { use datafusion_common::{test_util, Result, ScalarValue}; use datafusion_datasource::file_format::FileFormat; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; - use datafusion_datasource::PartitionedFile; + use datafusion_datasource::{PartitionedFile, TableSchema}; use datafusion_datasource_avro::source::AvroSource; use datafusion_datasource_avro::AvroFormat; use datafusion_execution::object_store::ObjectStoreUrl; @@ -81,15 +81,11 @@ mod tests { .infer_schema(&state, &store, std::slice::from_ref(&meta)) .await?; - let source = Arc::new(AvroSource::new()); - let conf = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - file_schema, - source, - ) - .with_file(meta.into()) - .with_projection(Some(vec![0, 1, 2])) - .build(); + let source = Arc::new(AvroSource::new(Arc::clone(&file_schema))); + let conf = FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), source) + .with_file(meta.into()) + .with_projection_indices(Some(vec![0, 1, 2])) + .build(); let source_exec = DataSourceExec::from_data_source(conf); assert_eq!( @@ -157,10 +153,10 @@ mod tests { // Include the missing column in the projection let projection = Some(vec![0, 1, 2, actual_schema.fields().len()]); - let source = Arc::new(AvroSource::new()); - let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) + let source = Arc::new(AvroSource::new(Arc::clone(&file_schema))); + let conf = FileScanConfigBuilder::new(object_store_url, source) .with_file(meta.into()) - .with_projection(projection) + .with_projection_indices(projection) .build(); let source_exec = DataSourceExec::from_data_source(conf); @@ -227,13 +223,16 @@ mod tests { partitioned_file.partition_values = vec![ScalarValue::from("2021-10-26")]; let projection = Some(vec![0, 1, file_schema.fields().len(), 2]); - let source = Arc::new(AvroSource::new()); - let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) + let table_schema = TableSchema::new( + file_schema.clone(), + vec![Arc::new(Field::new("date", DataType::Utf8, false))], + ); + let source = Arc::new(AvroSource::new(table_schema.clone())); + let conf = FileScanConfigBuilder::new(object_store_url, source) // select specific columns of the files as well as the partitioning // column which is supposed to be the last column in the table schema. 
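            // The avro/csv/json/parquet test hunks in this section all follow the
            // same migration pattern: file sources now carry the table schema (via
            // `TableSchema` or a `SchemaRef`), `FileScanConfigBuilder::new` no longer
            // takes a schema argument, partition columns move from
            // `.with_table_partition_cols(..)` into `TableSchema::new(..)`, and
            // `.with_projection(..)` is renamed to `.with_projection_indices(..)`.
            // A condensed before/after sketch, illustrative only and built from the
            // field names used in the surrounding tests (not an additional change
            // in this PR):
            //
            //   // before
            //   let source = Arc::new(AvroSource::new());
            //   let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source)
            //       .with_projection(Some(vec![0, 1, 2]))
            //       .with_table_partition_cols(vec![Field::new("date", DataType::Utf8, false)])
            //       .build();
            //
            //   // after
            //   let table_schema = TableSchema::new(
            //       Arc::clone(&file_schema),
            //       vec![Arc::new(Field::new("date", DataType::Utf8, false))],
            //   );
            //   let source = Arc::new(AvroSource::new(table_schema.clone()));
            //   let conf = FileScanConfigBuilder::new(object_store_url, source)
            //       .with_projection_indices(Some(vec![0, 1, 2]))
            //       .build();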
- .with_projection(projection) + .with_projection_indices(projection) .with_file(partitioned_file) - .with_table_partition_cols(vec![Field::new("date", DataType::Utf8, false)]) .build(); let source_exec = DataSourceExec::from_data_source(conf); diff --git a/datafusion/core/src/datasource/physical_plan/csv.rs b/datafusion/core/src/datasource/physical_plan/csv.rs index b2ef51a76f89..ac5df24d4999 100644 --- a/datafusion/core/src/datasource/physical_plan/csv.rs +++ b/datafusion/core/src/datasource/physical_plan/csv.rs @@ -29,12 +29,14 @@ mod tests { use std::io::Write; use std::sync::Arc; + use datafusion_datasource::TableSchema; use datafusion_datasource_csv::CsvFormat; use object_store::ObjectStore; use crate::prelude::CsvReadOptions; use crate::prelude::SessionContext; use crate::test::partitioned_file_groups; + use datafusion_common::config::CsvOptions; use datafusion_common::test_util::arrow_test_data; use datafusion_common::test_util::batches_to_string; use datafusion_common::{assert_batches_eq, Result}; @@ -94,6 +96,8 @@ mod tests { async fn csv_exec_with_projection( file_compression_type: FileCompressionType, ) -> Result<()> { + use datafusion_datasource::TableSchema; + let session_ctx = SessionContext::new(); let task_ctx = session_ctx.task_ctx(); let file_schema = aggr_test_schema(); @@ -110,16 +114,21 @@ mod tests { tmp_dir.path(), )?; - let source = Arc::new(CsvSource::new(true, b',', b'"')); - let config = FileScanConfigBuilder::from(partitioned_csv_config( - file_schema, - file_groups, - source, - )) - .with_file_compression_type(file_compression_type) - .with_newlines_in_values(false) - .with_projection(Some(vec![0, 2, 4])) - .build(); + let options = CsvOptions { + has_header: Some(true), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + let table_schema = TableSchema::from_file_schema(Arc::clone(&file_schema)); + let source = + Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options)); + let config = + FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)) + .with_file_compression_type(file_compression_type) + .with_newlines_in_values(false) + .with_projection_indices(Some(vec![0, 2, 4])) + .build(); assert_eq!(13, config.file_schema().fields().len()); let csv = DataSourceExec::from_data_source(config); @@ -158,6 +167,8 @@ mod tests { async fn csv_exec_with_mixed_order_projection( file_compression_type: FileCompressionType, ) -> Result<()> { + use datafusion_datasource::TableSchema; + let cfg = SessionConfig::new().set_str("datafusion.catalog.has_header", "true"); let session_ctx = SessionContext::new_with_config(cfg); let task_ctx = session_ctx.task_ctx(); @@ -175,16 +186,21 @@ mod tests { tmp_dir.path(), )?; - let source = Arc::new(CsvSource::new(true, b',', b'"')); - let config = FileScanConfigBuilder::from(partitioned_csv_config( - file_schema, - file_groups, - source, - )) - .with_newlines_in_values(false) - .with_file_compression_type(file_compression_type.to_owned()) - .with_projection(Some(vec![4, 0, 2])) - .build(); + let options = CsvOptions { + has_header: Some(true), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + let table_schema = TableSchema::from_file_schema(Arc::clone(&file_schema)); + let source = + Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options)); + let config = + FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)) + .with_newlines_in_values(false) + .with_file_compression_type(file_compression_type.to_owned()) + .with_projection_indices(Some(vec![4, 
0, 2])) + .build(); assert_eq!(13, config.file_schema().fields().len()); let csv = DataSourceExec::from_data_source(config); assert_eq!(3, csv.schema().fields().len()); @@ -221,6 +237,7 @@ mod tests { async fn csv_exec_with_limit( file_compression_type: FileCompressionType, ) -> Result<()> { + use datafusion_datasource::TableSchema; use futures::StreamExt; let cfg = SessionConfig::new().set_str("datafusion.catalog.has_header", "true"); @@ -240,16 +257,21 @@ mod tests { tmp_dir.path(), )?; - let source = Arc::new(CsvSource::new(true, b',', b'"')); - let config = FileScanConfigBuilder::from(partitioned_csv_config( - file_schema, - file_groups, - source, - )) - .with_newlines_in_values(false) - .with_file_compression_type(file_compression_type.to_owned()) - .with_limit(Some(5)) - .build(); + let options = CsvOptions { + has_header: Some(true), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + let table_schema = TableSchema::from_file_schema(Arc::clone(&file_schema)); + let source = + Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options)); + let config = + FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)) + .with_newlines_in_values(false) + .with_file_compression_type(file_compression_type.to_owned()) + .with_limit(Some(5)) + .build(); assert_eq!(13, config.file_schema().fields().len()); let csv = DataSourceExec::from_data_source(config); assert_eq!(13, csv.schema().fields().len()); @@ -287,6 +309,8 @@ mod tests { async fn csv_exec_with_missing_column( file_compression_type: FileCompressionType, ) -> Result<()> { + use datafusion_datasource::TableSchema; + let session_ctx = SessionContext::new(); let task_ctx = session_ctx.task_ctx(); let file_schema = aggr_test_schema_with_missing_col(); @@ -303,16 +327,21 @@ mod tests { tmp_dir.path(), )?; - let source = Arc::new(CsvSource::new(true, b',', b'"')); - let config = FileScanConfigBuilder::from(partitioned_csv_config( - file_schema, - file_groups, - source, - )) - .with_newlines_in_values(false) - .with_file_compression_type(file_compression_type.to_owned()) - .with_limit(Some(5)) - .build(); + let options = CsvOptions { + has_header: Some(true), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + let table_schema = TableSchema::from_file_schema(Arc::clone(&file_schema)); + let source = + Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options)); + let config = + FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)) + .with_newlines_in_values(false) + .with_file_compression_type(file_compression_type.to_owned()) + .with_limit(Some(5)) + .build(); assert_eq!(14, config.file_schema().fields().len()); let csv = DataSourceExec::from_data_source(config); assert_eq!(14, csv.schema().fields().len()); @@ -341,6 +370,7 @@ mod tests { file_compression_type: FileCompressionType, ) -> Result<()> { use datafusion_common::ScalarValue; + use datafusion_datasource::TableSchema; let session_ctx = SessionContext::new(); let task_ctx = session_ctx.task_ctx(); @@ -362,19 +392,26 @@ mod tests { let num_file_schema_fields = file_schema.fields().len(); - let source = Arc::new(CsvSource::new(true, b',', b'"')); - let config = FileScanConfigBuilder::from(partitioned_csv_config( - file_schema, - file_groups, - source, - )) - .with_newlines_in_values(false) - .with_file_compression_type(file_compression_type.to_owned()) - .with_table_partition_cols(vec![Field::new("date", DataType::Utf8, false)]) - // We should be able to project on the partition column - // Which is 
supposed to be after the file fields - .with_projection(Some(vec![0, num_file_schema_fields])) - .build(); + let options = CsvOptions { + has_header: Some(true), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + let table_schema = TableSchema::new( + Arc::clone(&file_schema), + vec![Arc::new(Field::new("date", DataType::Utf8, false))], + ); + let source = + Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options)); + let config = + FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)) + .with_newlines_in_values(false) + .with_file_compression_type(file_compression_type.to_owned()) + // We should be able to project on the partition column + // Which is supposed to be after the file fields + .with_projection_indices(Some(vec![0, num_file_schema_fields])) + .build(); // we don't have `/date=xx/` in the path but that is ok because // partitions are resolved during scan anyway @@ -463,15 +500,20 @@ mod tests { ) .unwrap(); - let source = Arc::new(CsvSource::new(true, b',', b'"')); - let config = FileScanConfigBuilder::from(partitioned_csv_config( - file_schema, - file_groups, - source, - )) - .with_newlines_in_values(false) - .with_file_compression_type(file_compression_type.to_owned()) - .build(); + let options = CsvOptions { + has_header: Some(true), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + let table_schema = TableSchema::from_file_schema(Arc::clone(&file_schema)); + let source = + Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options)); + let config = + FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)) + .with_newlines_in_values(false) + .with_file_compression_type(file_compression_type.to_owned()) + .build(); let csv = DataSourceExec::from_data_source(config); let it = csv.execute(0, task_ctx).unwrap(); diff --git a/datafusion/core/src/datasource/physical_plan/json.rs b/datafusion/core/src/datasource/physical_plan/json.rs index 0d45711c76fb..de7e87d25c84 100644 --- a/datafusion/core/src/datasource/physical_plan/json.rs +++ b/datafusion/core/src/datasource/physical_plan/json.rs @@ -176,8 +176,8 @@ mod tests { let (object_store_url, file_groups, file_schema) = prepare_store(&state, file_compression_type.to_owned(), tmp_dir.path()).await; - let source = Arc::new(JsonSource::new()); - let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) + let source = Arc::new(JsonSource::new(Arc::clone(&file_schema))); + let conf = FileScanConfigBuilder::new(object_store_url, source) .with_file_groups(file_groups) .with_limit(Some(3)) .with_file_compression_type(file_compression_type.to_owned()) @@ -251,8 +251,8 @@ mod tests { let file_schema = Arc::new(builder.finish()); let missing_field_idx = file_schema.fields.len() - 1; - let source = Arc::new(JsonSource::new()); - let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) + let source = Arc::new(JsonSource::new(Arc::clone(&file_schema))); + let conf = FileScanConfigBuilder::new(object_store_url, source) .with_file_groups(file_groups) .with_limit(Some(3)) .with_file_compression_type(file_compression_type.to_owned()) @@ -294,10 +294,10 @@ mod tests { let (object_store_url, file_groups, file_schema) = prepare_store(&state, file_compression_type.to_owned(), tmp_dir.path()).await; - let source = Arc::new(JsonSource::new()); - let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) + let source = Arc::new(JsonSource::new(Arc::clone(&file_schema))); + let conf = 
FileScanConfigBuilder::new(object_store_url, source) .with_file_groups(file_groups) - .with_projection(Some(vec![0, 2])) + .with_projection_indices(Some(vec![0, 2])) .with_file_compression_type(file_compression_type.to_owned()) .build(); let exec = DataSourceExec::from_data_source(conf); @@ -342,10 +342,10 @@ mod tests { let (object_store_url, file_groups, file_schema) = prepare_store(&state, file_compression_type.to_owned(), tmp_dir.path()).await; - let source = Arc::new(JsonSource::new()); - let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) + let source = Arc::new(JsonSource::new(Arc::clone(&file_schema))); + let conf = FileScanConfigBuilder::new(object_store_url, source) .with_file_groups(file_groups) - .with_projection(Some(vec![3, 0, 2])) + .with_projection_indices(Some(vec![3, 0, 2])) .with_file_compression_type(file_compression_type.to_owned()) .build(); let exec = DataSourceExec::from_data_source(conf); diff --git a/datafusion/core/src/datasource/physical_plan/parquet.rs b/datafusion/core/src/datasource/physical_plan/parquet.rs index 10a475c1cc9a..b27dcf56e33c 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet.rs @@ -54,7 +54,7 @@ mod tests { use datafusion_datasource::source::DataSourceExec; use datafusion_datasource::file::FileSource; - use datafusion_datasource::{FileRange, PartitionedFile}; + use datafusion_datasource::{FileRange, PartitionedFile, TableSchema}; use datafusion_datasource_parquet::source::ParquetSource; use datafusion_datasource_parquet::{ DefaultParquetFileReaderFactory, ParquetFileReaderFactory, ParquetFormat, @@ -65,7 +65,7 @@ mod tests { use datafusion_physical_plan::analyze::AnalyzeExec; use datafusion_physical_plan::collect; use datafusion_physical_plan::metrics::{ - ExecutionPlanMetricsSet, MetricType, MetricsSet, + ExecutionPlanMetricsSet, MetricType, MetricValue, MetricsSet, }; use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties}; @@ -161,7 +161,7 @@ mod tests { .as_ref() .map(|p| logical2physical(p, &table_schema)); - let mut source = ParquetSource::default(); + let mut source = ParquetSource::new(table_schema); if let Some(predicate) = predicate { source = source.with_predicate(predicate); } @@ -186,23 +186,19 @@ mod tests { source = source.with_bloom_filter_on_read(false); } - source.with_schema(Arc::clone(&table_schema)) + Arc::new(source) } fn build_parquet_exec( &self, - file_schema: SchemaRef, file_group: FileGroup, source: Arc, ) -> Arc { - let base_config = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - file_schema, - source, - ) - .with_file_group(file_group) - .with_projection(self.projection.clone()) - .build(); + let base_config = + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), source) + .with_file_group(file_group) + .with_projection_indices(self.projection.clone()) + .build(); DataSourceExec::from_data_source(base_config) } @@ -231,11 +227,8 @@ mod tests { // build a ParquetExec to return the results let parquet_source = self.build_file_source(Arc::clone(table_schema)); - let parquet_exec = self.build_parquet_exec( - Arc::clone(table_schema), - file_group.clone(), - Arc::clone(&parquet_source), - ); + let parquet_exec = + self.build_parquet_exec(file_group.clone(), Arc::clone(&parquet_source)); let analyze_exec = Arc::new(AnalyzeExec::new( false, @@ -243,7 +236,6 @@ mod tests { vec![MetricType::SUMMARY, MetricType::DEV], // use a new ParquetSource to avoid sharing execution metrics 
self.build_parquet_exec( - Arc::clone(table_schema), file_group.clone(), self.build_file_source(Arc::clone(table_schema)), ), @@ -1175,8 +1167,10 @@ mod tests { // There are 4 rows pruned in each of batch2, batch3, and // batch4 for a total of 12. batch1 had no pruning as c2 was // filled in as null - assert_eq!(get_value(&metrics, "page_index_rows_pruned"), 12); - assert_eq!(get_value(&metrics, "page_index_rows_matched"), 6); + let (page_index_pruned, page_index_matched) = + get_pruning_metric(&metrics, "page_index_rows_pruned"); + assert_eq!(page_index_pruned, 12); + assert_eq!(page_index_matched, 6); } #[tokio::test] @@ -1548,8 +1542,7 @@ mod tests { ) -> Result<()> { let config = FileScanConfigBuilder::new( ObjectStoreUrl::local_filesystem(), - file_schema, - Arc::new(ParquetSource::default()), + Arc::new(ParquetSource::new(file_schema)), ) .with_file_groups(file_groups) .build(); @@ -1651,23 +1644,26 @@ mod tests { ), ]); - let source = Arc::new(ParquetSource::default()); - let config = FileScanConfigBuilder::new(object_store_url, schema.clone(), source) - .with_file(partitioned_file) - // file has 10 cols so index 12 should be month and 13 should be day - .with_projection(Some(vec![0, 1, 2, 12, 13])) - .with_table_partition_cols(vec![ - Field::new("year", DataType::Utf8, false), - Field::new("month", DataType::UInt8, false), - Field::new( + let table_schema = TableSchema::new( + Arc::clone(&schema), + vec![ + Arc::new(Field::new("year", DataType::Utf8, false)), + Arc::new(Field::new("month", DataType::UInt8, false)), + Arc::new(Field::new( "day", DataType::Dictionary( Box::new(DataType::UInt16), Box::new(DataType::Utf8), ), false, - ), - ]) + )), + ], + ); + let source = Arc::new(ParquetSource::new(table_schema.clone())); + let config = FileScanConfigBuilder::new(object_store_url, source) + .with_file(partitioned_file) + // file has 10 cols so index 12 should be month and 13 should be day + .with_projection_indices(Some(vec![0, 1, 2, 12, 13])) .build(); let parquet_exec = DataSourceExec::from_data_source(config); @@ -1729,8 +1725,7 @@ mod tests { let file_schema = Arc::new(Schema::empty()); let config = FileScanConfigBuilder::new( ObjectStoreUrl::local_filesystem(), - file_schema, - Arc::new(ParquetSource::default()), + Arc::new(ParquetSource::new(file_schema)), ) .with_file(partitioned_file) .build(); @@ -1776,8 +1771,10 @@ mod tests { | 5 | +-----+ "###); - assert_eq!(get_value(&metrics, "page_index_rows_pruned"), 4); - assert_eq!(get_value(&metrics, "page_index_rows_matched"), 2); + let (page_index_pruned, page_index_matched) = + get_pruning_metric(&metrics, "page_index_rows_pruned"); + assert_eq!(page_index_pruned, 4); + assert_eq!(page_index_matched, 2); assert!( get_value(&metrics, "page_index_eval_time") > 0, "no eval time in metrics: {metrics:#?}" @@ -1866,8 +1863,10 @@ mod tests { assert_contains!(&explain, "predicate=c1@0 != bar"); // there's a single row group, but we can check that it matched - // if no pruning was done this would be 0 instead of 1 - assert_contains!(&explain, "row_groups_matched_statistics=1"); + assert_contains!( + &explain, + "row_groups_pruned_statistics=1 total \u{2192} 1 matched" + ); // check the projection assert_contains!(&explain, "projection=[c1]"); @@ -1898,8 +1897,10 @@ mod tests { // When both matched and pruned are 0, it means that the pruning predicate // was not used at all. 
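        // Note on the assertions below: row-group and page-index pruning counters
        // are now reported as a single combined metric (`MetricValue::PruningMetrics`)
        // carrying both the pruned and matched counts, which EXPLAIN renders as
        // e.g. `row_groups_pruned_statistics=1 total → 1 matched`. An illustrative
        // read of such a metric, mirroring the `get_pruning_metric` helper added
        // further down (a sketch, not an additional change in this PR):
        //
        //   if let Some(MetricValue::PruningMetrics { pruning_metrics, .. }) =
        //       metrics.sum_by_name("page_index_rows_pruned")
        //   {
        //       println!(
        //           "{} pruned, {} matched",
        //           pruning_metrics.pruned(),
        //           pruning_metrics.matched()
        //       );
        //   }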
- assert_contains!(&explain, "row_groups_matched_statistics=0"); - assert_contains!(&explain, "row_groups_pruned_statistics=0"); + assert_contains!( + &explain, + "row_groups_pruned_statistics=1 total \u{2192} 1 matched" + ); // But pushdown predicate should be present assert_contains!( @@ -1952,7 +1953,12 @@ mod tests { /// Panics if no such metric. fn get_value(metrics: &MetricsSet, metric_name: &str) -> usize { match metrics.sum_by_name(metric_name) { - Some(v) => v.as_usize(), + Some(v) => match v { + MetricValue::PruningMetrics { + pruning_metrics, .. + } => pruning_metrics.pruned(), + _ => v.as_usize(), + }, _ => { panic!( "Expected metric not found. Looking for '{metric_name}' in\n\n{metrics:#?}" @@ -1961,6 +1967,20 @@ mod tests { } } + fn get_pruning_metric(metrics: &MetricsSet, metric_name: &str) -> (usize, usize) { + match metrics.sum_by_name(metric_name) { + Some(MetricValue::PruningMetrics { + pruning_metrics, .. + }) => (pruning_metrics.pruned(), pruning_metrics.matched()), + Some(_) => panic!( + "Metric '{metric_name}' is not a pruning metric in\n\n{metrics:#?}" + ), + None => panic!( + "Expected metric not found. Looking for '{metric_name}' in\n\n{metrics:#?}" + ), + } + } + fn populate_csv_partitions( tmp_dir: &TempDir, partition_count: usize, @@ -2252,11 +2272,11 @@ mod tests { let size_hint_calls = reader_factory.metadata_size_hint_calls.clone(); let source = Arc::new( - ParquetSource::default() + ParquetSource::new(Arc::clone(&schema)) .with_parquet_file_reader_factory(reader_factory) .with_metadata_size_hint(456), ); - let config = FileScanConfigBuilder::new(store_url, schema, source) + let config = FileScanConfigBuilder::new(store_url, source) .with_file( PartitionedFile { object_meta: ObjectMeta { diff --git a/datafusion/core/src/execution/context/csv.rs b/datafusion/core/src/execution/context/csv.rs index 15d6d21f038a..e6f95886e91d 100644 --- a/datafusion/core/src/execution/context/csv.rs +++ b/datafusion/core/src/execution/context/csv.rs @@ -37,9 +37,16 @@ impl SessionContext { /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); /// // You can read a single file using `read_csv` - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// // you can also read multiple files: - /// let df = ctx.read_csv(vec!["tests/data/example.csv", "tests/data/example.csv"], CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv( + /// vec!["tests/data/example.csv", "tests/data/example.csv"], + /// CsvReadOptions::new(), + /// ) + /// .await?; /// # Ok(()) /// # } /// ``` diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 448ee5264afd..c732c2c92f64 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -76,6 +76,7 @@ pub use datafusion_execution::config::SessionConfig; use datafusion_execution::registry::SerializerRegistry; pub use datafusion_execution::TaskContext; pub use datafusion_expr::execution_props::ExecutionProps; +use datafusion_expr::simplify::SimplifyContext; use datafusion_expr::{ expr_rewriter::FunctionRewrite, logical_plan::{DdlStatement, Statement}, @@ -83,6 +84,7 @@ use datafusion_expr::{ Expr, UserDefinedLogicalNode, WindowUDF, }; use datafusion_optimizer::analyzer::type_coercion::TypeCoercion; +use datafusion_optimizer::simplify_expressions::ExprSimplifier; use 
datafusion_optimizer::Analyzer; use datafusion_optimizer::{AnalyzerRule, OptimizerRule}; use datafusion_session::SessionStore; @@ -166,22 +168,23 @@ where /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); -/// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; -/// let df = df.filter(col("a").lt_eq(col("b")))? -/// .aggregate(vec![col("a")], vec![min(col("b"))])? -/// .limit(0, Some(100))?; -/// let results = df -/// .collect() -/// .await?; +/// let df = ctx +/// .read_csv("tests/data/example.csv", CsvReadOptions::new()) +/// .await?; +/// let df = df +/// .filter(col("a").lt_eq(col("b")))? +/// .aggregate(vec![col("a")], vec![min(col("b"))])? +/// .limit(0, Some(100))?; +/// let results = df.collect().await?; /// assert_batches_eq!( -/// &[ -/// "+---+----------------+", -/// "| a | min(?table?.b) |", -/// "+---+----------------+", -/// "| 1 | 2 |", -/// "+---+----------------+", -/// ], -/// &results +/// &[ +/// "+---+----------------+", +/// "| a | min(?table?.b) |", +/// "+---+----------------+", +/// "| 1 | 2 |", +/// "+---+----------------+", +/// ], +/// &results /// ); /// # Ok(()) /// # } @@ -197,21 +200,22 @@ where /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); -/// ctx.register_csv("example", "tests/data/example.csv", CsvReadOptions::new()).await?; +/// ctx.register_csv("example", "tests/data/example.csv", CsvReadOptions::new()) +/// .await?; /// let results = ctx -/// .sql("SELECT a, min(b) FROM example GROUP BY a LIMIT 100") -/// .await? -/// .collect() -/// .await?; +/// .sql("SELECT a, min(b) FROM example GROUP BY a LIMIT 100") +/// .await? +/// .collect() +/// .await?; /// assert_batches_eq!( -/// &[ -/// "+---+----------------+", -/// "| a | min(example.b) |", -/// "+---+----------------+", -/// "| 1 | 2 |", -/// "+---+----------------+", -/// ], -/// &results +/// &[ +/// "+---+----------------+", +/// "| a | min(example.b) |", +/// "+---+----------------+", +/// "| 1 | 2 |", +/// "+---+----------------+", +/// ], +/// &results /// ); /// # Ok(()) /// # } @@ -231,18 +235,18 @@ where /// let config = SessionConfig::new().with_batch_size(4 * 1024); /// /// // configure a memory limit of 1GB with 20% slop -/// let runtime_env = RuntimeEnvBuilder::new() +/// let runtime_env = RuntimeEnvBuilder::new() /// .with_memory_limit(1024 * 1024 * 1024, 0.80) /// .build_arc() /// .unwrap(); /// /// // Create a SessionState using the config and runtime_env /// let state = SessionStateBuilder::new() -/// .with_config(config) -/// .with_runtime_env(runtime_env) -/// // include support for built in functions and configurations -/// .with_default_features() -/// .build(); +/// .with_config(config) +/// .with_runtime_env(runtime_env) +/// // include support for built in functions and configurations +/// .with_default_features() +/// .build(); /// /// // Create a SessionContext /// let ctx = SessionContext::from(state); @@ -428,16 +432,14 @@ impl SessionContext { /// # use datafusion::prelude::*; /// # use datafusion::execution::SessionStateBuilder; /// # use datafusion_optimizer::push_down_filter::PushDownFilter; - /// let my_rule = PushDownFilter{}; // pretend it is a new rule - /// // Create a new builder with a custom optimizer rule + /// let my_rule = PushDownFilter {}; // pretend it is a new rule + /// // Create a new builder with a custom optimizer rule /// let context: SessionContext = SessionStateBuilder::new() - /// .with_optimizer_rule(Arc::new(my_rule)) - /// 
.build() - /// .into(); + /// .with_optimizer_rule(Arc::new(my_rule)) + /// .build() + /// .into(); /// // Enable local file access and convert context back to a builder - /// let builder = context - /// .enable_url_table() - /// .into_state_builder(); + /// let builder = context.enable_url_table().into_state_builder(); /// ``` pub fn into_state_builder(self) -> SessionStateBuilder { let SessionContext { @@ -585,11 +587,10 @@ impl SessionContext { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// ctx - /// .sql("CREATE TABLE foo (x INTEGER)") - /// .await? - /// .collect() - /// .await?; + /// ctx.sql("CREATE TABLE foo (x INTEGER)") + /// .await? + /// .collect() + /// .await?; /// assert!(ctx.table_exist("foo").unwrap()); /// # Ok(()) /// # } @@ -614,14 +615,14 @@ impl SessionContext { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let options = SQLOptions::new() - /// .with_allow_ddl(false); - /// let err = ctx.sql_with_options("CREATE TABLE foo (x INTEGER)", options) - /// .await - /// .unwrap_err(); - /// assert!( - /// err.to_string().starts_with("Error during planning: DDL not supported: CreateMemoryTable") - /// ); + /// let options = SQLOptions::new().with_allow_ddl(false); + /// let err = ctx + /// .sql_with_options("CREATE TABLE foo (x INTEGER)", options) + /// .await + /// .unwrap_err(); + /// assert!(err + /// .to_string() + /// .starts_with("Error during planning: DDL not supported: CreateMemoryTable")); /// # Ok(()) /// # } /// ``` @@ -653,8 +654,7 @@ impl SessionContext { /// // provide type information that `a` is an Int32 /// let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); /// let df_schema = DFSchema::try_from(schema).unwrap(); - /// let expr = SessionContext::new() - /// .parse_sql_expr(sql, &df_schema)?; + /// let expr = SessionContext::new().parse_sql_expr(sql, &df_schema)?; /// assert_eq!(expected, expr); /// # Ok(()) /// # } @@ -1143,8 +1143,14 @@ impl SessionContext { /// ``` /// use datafusion::execution::context::SessionContext; /// - /// assert_eq!(SessionContext::parse_memory_limit("1M").unwrap(), 1024 * 1024); - /// assert_eq!(SessionContext::parse_memory_limit("1.5G").unwrap(), (1.5 * 1024.0 * 1024.0 * 1024.0) as usize); + /// assert_eq!( + /// SessionContext::parse_memory_limit("1M").unwrap(), + /// 1024 * 1024 + /// ); + /// assert_eq!( + /// SessionContext::parse_memory_limit("1.5G").unwrap(), + /// (1.5 * 1024.0 * 1024.0 * 1024.0) as usize + /// ); /// ``` pub fn parse_memory_limit(limit: &str) -> Result { let (number, unit) = limit.split_at(limit.len() - 1); @@ -1265,14 +1271,18 @@ impl SessionContext { exec_datafusion_err!("Prepared statement '{}' does not exist", name) })?; + let state = self.state.read(); + let context = SimplifyContext::new(state.execution_props()); + let simplifier = ExprSimplifier::new(context); + // Only allow literals as parameters for now. let mut params: Vec = parameters .into_iter() - .map(|e| match e { + .map(|e| match simplifier.simplify(e)? 
{ Expr::Literal(scalar, metadata) => { Ok(ScalarAndMetadata::new(scalar, metadata)) } - _ => not_impl_err!("Unsupported parameter type: {}", e), + e => not_impl_err!("Unsupported parameter type: {e}"), }) .collect::>()?; diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 561e0c363a37..d7a66db28ac4 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -37,7 +37,9 @@ use datafusion_catalog::information_schema::{ use datafusion_catalog::MemoryCatalogProviderList; use datafusion_catalog::{TableFunction, TableFunctionImpl}; use datafusion_common::alias::AliasGenerator; -use datafusion_common::config::{ConfigExtension, ConfigOptions, Dialect, TableOptions}; +#[cfg(feature = "sql")] +use datafusion_common::config::Dialect; +use datafusion_common::config::{ConfigExtension, ConfigOptions, TableOptions}; use datafusion_common::display::{PlanType, StringifiedPlan, ToStringifiedPlan}; use datafusion_common::tree_node::TreeNode; use datafusion_common::{ @@ -114,12 +116,12 @@ use uuid::Uuid; /// # use std::sync::Arc; /// # #[tokio::main] /// # async fn main() -> Result<()> { -/// let state = SessionStateBuilder::new() -/// .with_config(SessionConfig::new()) -/// .with_runtime_env(Arc::new(RuntimeEnv::default())) -/// .with_default_features() -/// .build(); -/// Ok(()) +/// let state = SessionStateBuilder::new() +/// .with_config(SessionConfig::new()) +/// .with_runtime_env(Arc::new(RuntimeEnv::default())) +/// .with_default_features() +/// .build(); +/// Ok(()) /// # } /// ``` /// @@ -545,6 +547,16 @@ impl SessionState { let sql_expr = self.sql_to_expr_with_alias(sql, &dialect)?; + self.create_logical_expr_from_sql_expr(sql_expr, df_schema) + } + + /// Creates a datafusion style AST [`Expr`] from a SQL expression. 
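     /// A usage sketch, mirroring the unit test added later in this diff (the SQL
     /// text and `df_schema` here are illustrative placeholders):
     ///
     /// ```ignore
     /// let sql_expr = state.sql_to_expr_with_alias("a > 10", &dialect)?;
     /// let expr = state.create_logical_expr_from_sql_expr(sql_expr, &df_schema)?;
     /// ```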
+ #[cfg(feature = "sql")] + pub fn create_logical_expr_from_sql_expr( + &self, + sql_expr: SQLExprWithAlias, + df_schema: &DFSchema, + ) -> datafusion_common::Result { let provider = SessionContextProvider { state: self, tables: HashMap::new(), @@ -2095,6 +2107,36 @@ mod tests { assert!(sql_to_expr(&state).is_err()) } + #[test] + #[cfg(feature = "sql")] + fn test_create_logical_expr_from_sql_expr() { + let state = SessionStateBuilder::new().with_default_features().build(); + + let provider = SessionContextProvider { + state: &state, + tables: HashMap::new(), + }; + + let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); + let df_schema = DFSchema::try_from(schema).unwrap(); + let dialect = state.config.options().sql_parser.dialect; + let query = SqlToRel::new_with_options(&provider, state.get_parser_options()); + + for sql in ["[1,2,3]", "a > 10", "SUM(a)"] { + let sql_expr = state.sql_to_expr(sql, &dialect).unwrap(); + let from_str = query + .sql_to_expr(sql_expr, &df_schema, &mut PlannerContext::new()) + .unwrap(); + + let sql_expr_with_alias = + state.sql_to_expr_with_alias(sql, &dialect).unwrap(); + let from_expr = state + .create_logical_expr_from_sql_expr(sql_expr_with_alias, &df_schema) + .unwrap(); + assert_eq!(from_str, from_expr); + } + } + #[test] fn test_from_existing() -> Result<()> { fn employee_batch() -> RecordBatch { diff --git a/datafusion/core/src/execution/session_state_defaults.rs b/datafusion/core/src/execution/session_state_defaults.rs index baf396f3f1c5..62a575541a5d 100644 --- a/datafusion/core/src/execution/session_state_defaults.rs +++ b/datafusion/core/src/execution/session_state_defaults.rs @@ -101,7 +101,7 @@ impl SessionStateDefaults { expr_planners } - /// returns the list of default [`ScalarUDF']'s + /// returns the list of default [`ScalarUDF`]s pub fn default_scalar_functions() -> Vec> { #[cfg_attr(not(feature = "nested_expressions"), allow(unused_mut))] let mut functions: Vec> = functions::all_default_functions(); @@ -112,12 +112,12 @@ impl SessionStateDefaults { functions } - /// returns the list of default [`AggregateUDF']'s + /// returns the list of default [`AggregateUDF`]s pub fn default_aggregate_functions() -> Vec> { functions_aggregate::all_default_aggregate_functions() } - /// returns the list of default [`WindowUDF']'s + /// returns the list of default [`WindowUDF`]s pub fn default_window_functions() -> Vec> { functions_window::all_default_window_functions() } @@ -127,7 +127,7 @@ impl SessionStateDefaults { functions_table::all_default_table_functions() } - /// returns the list of default [`FileFormatFactory']'s + /// returns the list of default [`FileFormatFactory`]s pub fn default_file_formats() -> Vec> { let file_formats: Vec> = vec![ #[cfg(feature = "parquet")] diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs index e7ace544a11c..381dd5e9e848 100644 --- a/datafusion/core/src/lib.rs +++ b/datafusion/core/src/lib.rs @@ -86,26 +86,29 @@ //! let ctx = SessionContext::new(); //! //! // create the dataframe -//! let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; +//! let df = ctx +//! .read_csv("tests/data/example.csv", CsvReadOptions::new()) +//! .await?; //! //! // create a plan -//! let df = df.filter(col("a").lt_eq(col("b")))? -//! .aggregate(vec![col("a")], vec![min(col("b"))])? -//! .limit(0, Some(100))?; +//! let df = df +//! .filter(col("a").lt_eq(col("b")))? +//! .aggregate(vec![col("a")], vec![min(col("b"))])? +//! .limit(0, Some(100))?; //! //! // execute the plan //! 
let results: Vec = df.collect().await?; //! //! // format the results -//! let pretty_results = arrow::util::pretty::pretty_format_batches(&results)? -//! .to_string(); +//! let pretty_results = +//! arrow::util::pretty::pretty_format_batches(&results)?.to_string(); //! //! let expected = vec![ //! "+---+----------------+", //! "| a | min(?table?.b) |", //! "+---+----------------+", //! "| 1 | 2 |", -//! "+---+----------------+" +//! "+---+----------------+", //! ]; //! //! assert_eq!(pretty_results.trim().lines().collect::>(), expected); @@ -126,24 +129,27 @@ //! # async fn main() -> Result<()> { //! let ctx = SessionContext::new(); //! -//! ctx.register_csv("example", "tests/data/example.csv", CsvReadOptions::new()).await?; +//! ctx.register_csv("example", "tests/data/example.csv", CsvReadOptions::new()) +//! .await?; //! //! // create a plan -//! let df = ctx.sql("SELECT a, MIN(b) FROM example WHERE a <= b GROUP BY a LIMIT 100").await?; +//! let df = ctx +//! .sql("SELECT a, MIN(b) FROM example WHERE a <= b GROUP BY a LIMIT 100") +//! .await?; //! //! // execute the plan //! let results: Vec = df.collect().await?; //! //! // format the results -//! let pretty_results = arrow::util::pretty::pretty_format_batches(&results)? -//! .to_string(); +//! let pretty_results = +//! arrow::util::pretty::pretty_format_batches(&results)?.to_string(); //! //! let expected = vec![ //! "+---+----------------+", //! "| a | min(example.b) |", //! "+---+----------------+", //! "| 1 | 2 |", -//! "+---+----------------+" +//! "+---+----------------+", //! ]; //! //! assert_eq!(pretty_results.trim().lines().collect::>(), expected); @@ -443,7 +449,30 @@ //! other operators read a single [`RecordBatch`] from their input to produce a //! single [`RecordBatch`] as output. //! -//! For example, given this SQL query: +//! For example, given this SQL: +//! +//! ```sql +//! SELECT name FROM 'data.parquet' WHERE id > 10 +//! ``` +//! +//! An simplified DataFusion execution plan is shown below. It first reads +//! data from the Parquet file, then applies the filter, then the projection, +//! and finally produces output. Each step processes one [`RecordBatch`] at a +//! time. Multiple batches are processed concurrently on different CPU cores +//! for plans with multiple partitions. +//! +//! ```text +//! ┌─────────────┐ ┌──────────────┐ ┌────────────────┐ ┌──────────────────┐ ┌──────────┐ +//! │ Parquet │───▶│ DataSource │───▶│ FilterExec │───▶│ ProjectionExec │───▶│ Results │ +//! │ File │ │ │ │ │ │ │ │ │ +//! └─────────────┘ └──────────────┘ └────────────────┘ └──────────────────┘ └──────────┘ +//! (reads data) (id > 10) (keeps "name" col) +//! RecordBatch ───▶ RecordBatch ────▶ RecordBatch ────▶ RecordBatch +//! ``` +//! +//! DataFusion uses the classic "pull" based control flow (explained more in the +//! next section) to implement streaming execution. As an example, +//! consider the following SQL query: //! //! ```sql //! SELECT date_trunc('month', time) FROM data WHERE id IN (10,20,30); @@ -607,7 +636,7 @@ //! └─────────────┘ ┗━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━┛ //! ─────────────────────────────────────────────────────────────▶ //! time -//!``` +//! ``` //! //! Note that DataFusion does not use [`tokio::task::spawn_blocking`] for //! 
CPU-bounded work, because `spawn_blocking` is designed for blocking **IO**, @@ -897,6 +926,12 @@ doc_comment::doctest!("../../../README.md", readme_example_test); // For example, if `user_guide_expressions(line 123)` fails, // go to `docs/source/user-guide/expressions.md` to find the relevant problem. // +#[cfg(doctest)] +doc_comment::doctest!( + "../../../docs/source/user-guide/arrow-introduction.md", + user_guide_arrow_introduction +); + #[cfg(doctest)] doc_comment::doctest!( "../../../docs/source/user-guide/concepts-readings-events.md", diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 708c52001ee8..6a75485c6284 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -64,7 +64,9 @@ use datafusion_catalog::ScanArgs; use datafusion_common::display::ToStringifiedPlan; use datafusion_common::format::ExplainAnalyzeLevel; use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion, TreeNodeVisitor}; -use datafusion_common::TableReference; +use datafusion_common::{ + assert_eq_or_internal_err, assert_or_internal_err, TableReference, +}; use datafusion_common::{ exec_err, internal_datafusion_err, internal_err, not_impl_err, plan_err, DFSchema, ScalarValue, @@ -347,11 +349,11 @@ impl DefaultPhysicalPlanner { .flatten() .collect::>(); // Ideally this never happens if we have a valid LogicalPlan tree - if outputs.len() != 1 { - return internal_err!( - "Failed to convert LogicalPlan to ExecutionPlan: More than one root detected" - ); - } + assert_eq_or_internal_err!( + outputs.len(), + 1, + "Failed to convert LogicalPlan to ExecutionPlan: More than one root detected" + ); let plan = outputs.pop().unwrap(); Ok(plan) } @@ -588,9 +590,10 @@ impl DefaultPhysicalPlanner { } } LogicalPlan::Window(Window { window_expr, .. 
}) => { - if window_expr.is_empty() { - return internal_err!("Impossibly got empty window expression"); - } + assert_or_internal_err!( + !window_expr.is_empty(), + "Impossibly got empty window expression" + ); let input_exec = children.one()?; @@ -1764,14 +1767,12 @@ fn qualify_join_schema_sides( .zip(left_fields.iter().chain(right_fields.iter())) .enumerate() { - if field.name() != expected.name() { - return internal_err!( - "Field name mismatch at index {}: expected '{}', found '{}'", - i, - expected.name(), - field.name() - ); - } + assert_eq_or_internal_err!( + field.name(), + expected.name(), + "Field name mismatch at index {}", + i + ); } // qualify sides @@ -2644,7 +2645,7 @@ mod tests { // verify that the plan correctly casts u8 to i64 // the cast from u8 to i64 for literal will be simplified, and get lit(int64(5)) // the cast here is implicit so has CastOptions with safe=true - let expected = r#"BinaryExpr { left: Column { name: "c7", index: 2 }, op: Lt, right: Literal { value: Int64(5), field: Field { name: "lit", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, fail_on_overflow: false }"#; + let expected = r#"BinaryExpr { left: Column { name: "c7", index: 2 }, op: Lt, right: Literal { value: Int64(5), field: Field { name: "lit", data_type: Int64 } }, fail_on_overflow: false"#; assert_contains!(format!("{exec_plan:?}"), expected); Ok(()) @@ -2704,9 +2705,6 @@ mod tests { name: "lit", data_type: Utf8, nullable: true, - dict_id: 0, - dict_is_ordered: false, - metadata: {}, }, }, "c1", @@ -2718,9 +2716,6 @@ mod tests { name: "lit", data_type: Int64, nullable: true, - dict_id: 0, - dict_is_ordered: false, - metadata: {}, }, }, "c2", @@ -2732,9 +2727,6 @@ mod tests { name: "lit", data_type: Int64, nullable: true, - dict_id: 0, - dict_is_ordered: false, - metadata: {}, }, }, "c3", @@ -2843,9 +2835,6 @@ mod tests { name: "lit", data_type: Utf8, nullable: true, - dict_id: 0, - dict_is_ordered: false, - metadata: {}, }, }, "c1", @@ -2857,9 +2846,6 @@ mod tests { name: "lit", data_type: Int64, nullable: true, - dict_id: 0, - dict_is_ordered: false, - metadata: {}, }, }, "c2", @@ -2871,9 +2857,6 @@ mod tests { name: "lit", data_type: Int64, nullable: true, - dict_id: 0, - dict_is_ordered: false, - metadata: {}, }, }, "c3", @@ -3047,7 +3030,7 @@ mod tests { .expect_err("planning error") .strip_backtrace(); - insta::assert_snapshot!(e, @r#"Error during planning: Extension planner for NoOp created an ExecutionPlan with mismatched schema. LogicalPlan schema: DFSchema { inner: Schema { fields: [Field { name: "a", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {} }, field_qualifiers: [None], functional_dependencies: FunctionalDependencies { deps: [] } }, ExecutionPlan schema: Schema { fields: [Field { name: "b", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {} }"#); + insta::assert_snapshot!(e, @r#"Error during planning: Extension planner for NoOp created an ExecutionPlan with mismatched schema. 
LogicalPlan schema: DFSchema { inner: Schema { fields: [Field { name: "a", data_type: Int32 }], metadata: {} }, field_qualifiers: [None], functional_dependencies: FunctionalDependencies { deps: [] } }, ExecutionPlan schema: Schema { fields: [Field { name: "b", data_type: Int32 }], metadata: {} }"#); } #[tokio::test] @@ -3063,7 +3046,7 @@ mod tests { let execution_plan = plan(&logical_plan).await?; // verify that the plan correctly adds cast from Int64(1) to Utf8, and the const will be evaluated. - let expected = "exprs: [ProjectionExpr { expr: BinaryExpr { left: BinaryExpr { left: Column { name: \"c1\", index: 0 }, op: Eq, right: Literal { value: Utf8(\"a\"), field: Field { name: \"lit\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, fail_on_overflow: false }, op: Or, right: BinaryExpr { left: Column { name: \"c1\", index: 0 }, op: Eq, right: Literal { value: Utf8(\"1\"), field: Field { name: \"lit\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, fail_on_overflow: false }, fail_on_overflow: false }"; + let expected = r#"expr: BinaryExpr { left: BinaryExpr { left: Column { name: "c1", index: 0 }, op: Eq, right: Literal { value: Utf8("a"), field: Field { name: "lit", data_type: Utf8 } }, fail_on_overflow: false }"#; assert_contains!(format!("{execution_plan:?}"), expected); @@ -3085,7 +3068,7 @@ mod tests { assert_contains!( &e, - r#"Error during planning: Can not find compatible types to compare Boolean with [Struct(foo Boolean), Utf8]"# + r#"Error during planning: Can not find compatible types to compare Boolean with [Struct("foo": Boolean), Utf8]"# ); Ok(()) diff --git a/datafusion/core/src/test/mod.rs b/datafusion/core/src/test/mod.rs index 68f83e7f1f11..bbc85af7d874 100644 --- a/datafusion/core/src/test/mod.rs +++ b/datafusion/core/src/test/mod.rs @@ -35,12 +35,15 @@ use crate::error::Result; use crate::logical_expr::LogicalPlan; use crate::test_util::{aggr_test_schema, arrow_test_data}; +use datafusion_common::config::CsvOptions; + use arrow::array::{self, Array, ArrayRef, Decimal128Builder, Int32Array}; use arrow::datatypes::{DataType, Field, Schema}; use arrow::record_batch::RecordBatch; #[cfg(feature = "compression")] use datafusion_common::DataFusionError; use datafusion_datasource::source::DataSourceExec; +use datafusion_datasource::TableSchema; #[cfg(feature = "compression")] use bzip2::write::BzEncoder; @@ -92,11 +95,17 @@ pub fn scan_partitioned_csv( FileCompressionType::UNCOMPRESSED, work_dir, )?; - let source = Arc::new(CsvSource::new(true, b'"', b'"')); - let config = - FileScanConfigBuilder::from(partitioned_csv_config(schema, file_groups, source)) - .with_file_compression_type(FileCompressionType::UNCOMPRESSED) - .build(); + let options = CsvOptions { + has_header: Some(true), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + let table_schema = TableSchema::from_file_schema(schema); + let source = Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options)); + let config = FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)) + .with_file_compression_type(FileCompressionType::UNCOMPRESSED) + .build(); Ok(DataSourceExec::from_data_source(config)) } diff --git a/datafusion/core/src/test_util/parquet.rs b/datafusion/core/src/test_util/parquet.rs index eb4c61c02524..b5213cee3f2d 100644 --- a/datafusion/core/src/test_util/parquet.rs +++ b/datafusion/core/src/test_util/parquet.rs @@ -37,7 +37,6 @@ use crate::physical_plan::metrics::MetricsSet; use 
crate::physical_plan::ExecutionPlan; use crate::prelude::{Expr, SessionConfig, SessionContext}; -use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_datasource::source::DataSourceExec; use object_store::path::Path; @@ -156,20 +155,21 @@ impl TestParquetFile { maybe_filter: Option, ) -> Result> { let parquet_options = ctx.copied_table_options().parquet; - let source = Arc::new(ParquetSource::new(parquet_options.clone())); - let scan_config_builder = FileScanConfigBuilder::new( - self.object_store_url.clone(), - Arc::clone(&self.schema), - source, - ) - .with_file(PartitionedFile { - object_meta: self.object_meta.clone(), - partition_values: vec![], - range: None, - statistics: None, - extensions: None, - metadata_size_hint: None, - }); + let source = Arc::new( + ParquetSource::new(Arc::clone(&self.schema)) + .with_table_parquet_options(parquet_options.clone()), + ); + let scan_config_builder = + FileScanConfigBuilder::new(self.object_store_url.clone(), source).with_file( + PartitionedFile { + object_meta: self.object_meta.clone(), + partition_values: vec![], + range: None, + statistics: None, + extensions: None, + metadata_size_hint: None, + }, + ); let df_schema = Arc::clone(&self.schema).to_dfschema_ref()?; @@ -183,10 +183,10 @@ impl TestParquetFile { create_physical_expr(&filter, &df_schema, &ExecutionProps::default())?; let source = Arc::new( - ParquetSource::new(parquet_options) + ParquetSource::new(Arc::clone(&self.schema)) + .with_table_parquet_options(parquet_options) .with_predicate(Arc::clone(&physical_filter_expr)), - ) - .with_schema(Arc::clone(&self.schema)); + ); let config = scan_config_builder.with_source(source).build(); let parquet_exec = DataSourceExec::from_data_source(config); diff --git a/datafusion/core/tests/catalog_listing/mod.rs b/datafusion/core/tests/catalog_listing/mod.rs new file mode 100644 index 000000000000..cb6cac4fb067 --- /dev/null +++ b/datafusion/core/tests/catalog_listing/mod.rs @@ -0,0 +1,18 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +mod pruned_partition_list; diff --git a/datafusion/core/tests/catalog_listing/pruned_partition_list.rs b/datafusion/core/tests/catalog_listing/pruned_partition_list.rs new file mode 100644 index 000000000000..3cdaa3bb9b34 --- /dev/null +++ b/datafusion/core/tests/catalog_listing/pruned_partition_list.rs @@ -0,0 +1,251 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use arrow_schema::DataType; +use futures::{FutureExt, StreamExt as _, TryStreamExt as _}; +use object_store::{memory::InMemory, path::Path, ObjectStore as _}; + +use datafusion::execution::SessionStateBuilder; +use datafusion_catalog_listing::helpers::{ + describe_partition, list_partitions, pruned_partition_list, +}; +use datafusion_common::ScalarValue; +use datafusion_datasource::ListingTableUrl; +use datafusion_expr::{col, lit, Expr}; +use datafusion_session::Session; + +#[tokio::test] +async fn test_pruned_partition_list_empty() { + let (store, state) = make_test_store_and_state(&[ + ("tablepath/mypartition=val1/notparquetfile", 100), + ("tablepath/mypartition=val1/ignoresemptyfile.parquet", 0), + ("tablepath/file.parquet", 100), + ("tablepath/notapartition/file.parquet", 100), + ("tablepath/notmypartition=val1/file.parquet", 100), + ]); + let filter = Expr::eq(col("mypartition"), lit("val1")); + let pruned = pruned_partition_list( + state.as_ref(), + store.as_ref(), + &ListingTableUrl::parse("file:///tablepath/").unwrap(), + &[filter], + ".parquet", + &[(String::from("mypartition"), DataType::Utf8)], + ) + .await + .expect("partition pruning failed") + .collect::>() + .await; + + assert_eq!(pruned.len(), 0); +} + +#[tokio::test] +async fn test_pruned_partition_list() { + let (store, state) = make_test_store_and_state(&[ + ("tablepath/mypartition=val1/file.parquet", 100), + ("tablepath/mypartition=val2/file.parquet", 100), + ("tablepath/mypartition=val1/ignoresemptyfile.parquet", 0), + ("tablepath/mypartition=val1/other=val3/file.parquet", 100), + ("tablepath/notapartition/file.parquet", 100), + ("tablepath/notmypartition=val1/file.parquet", 100), + ]); + let filter = Expr::eq(col("mypartition"), lit("val1")); + let pruned = pruned_partition_list( + state.as_ref(), + store.as_ref(), + &ListingTableUrl::parse("file:///tablepath/").unwrap(), + &[filter], + ".parquet", + &[(String::from("mypartition"), DataType::Utf8)], + ) + .await + .expect("partition pruning failed") + .try_collect::>() + .await + .unwrap(); + + assert_eq!(pruned.len(), 2); + let f1 = &pruned[0]; + assert_eq!( + f1.object_meta.location.as_ref(), + "tablepath/mypartition=val1/file.parquet" + ); + assert_eq!(&f1.partition_values, &[ScalarValue::from("val1")]); + let f2 = &pruned[1]; + assert_eq!( + f2.object_meta.location.as_ref(), + "tablepath/mypartition=val1/other=val3/file.parquet" + ); + assert_eq!(f2.partition_values, &[ScalarValue::from("val1"),]); +} + +#[tokio::test] +async fn test_pruned_partition_list_multi() { + let (store, state) = make_test_store_and_state(&[ + ("tablepath/part1=p1v1/file.parquet", 100), + ("tablepath/part1=p1v2/part2=p2v1/file1.parquet", 100), + ("tablepath/part1=p1v2/part2=p2v1/file2.parquet", 100), + ("tablepath/part1=p1v3/part2=p2v1/file2.parquet", 100), + ("tablepath/part1=p1v2/part2=p2v2/file2.parquet", 100), + ]); + let filter1 = Expr::eq(col("part1"), lit("p1v2")); + let 
filter2 = Expr::eq(col("part2"), lit("p2v1")); + let pruned = pruned_partition_list( + state.as_ref(), + store.as_ref(), + &ListingTableUrl::parse("file:///tablepath/").unwrap(), + &[filter1, filter2], + ".parquet", + &[ + (String::from("part1"), DataType::Utf8), + (String::from("part2"), DataType::Utf8), + ], + ) + .await + .expect("partition pruning failed") + .try_collect::>() + .await + .unwrap(); + + assert_eq!(pruned.len(), 2); + let f1 = &pruned[0]; + assert_eq!( + f1.object_meta.location.as_ref(), + "tablepath/part1=p1v2/part2=p2v1/file1.parquet" + ); + assert_eq!( + &f1.partition_values, + &[ScalarValue::from("p1v2"), ScalarValue::from("p2v1"),] + ); + let f2 = &pruned[1]; + assert_eq!( + f2.object_meta.location.as_ref(), + "tablepath/part1=p1v2/part2=p2v1/file2.parquet" + ); + assert_eq!( + &f2.partition_values, + &[ScalarValue::from("p1v2"), ScalarValue::from("p2v1")] + ); +} + +#[tokio::test] +async fn test_list_partition() { + let (store, _) = make_test_store_and_state(&[ + ("tablepath/part1=p1v1/file.parquet", 100), + ("tablepath/part1=p1v2/part2=p2v1/file1.parquet", 100), + ("tablepath/part1=p1v2/part2=p2v1/file2.parquet", 100), + ("tablepath/part1=p1v3/part2=p2v1/file3.parquet", 100), + ("tablepath/part1=p1v2/part2=p2v2/file4.parquet", 100), + ("tablepath/part1=p1v2/part2=p2v2/empty.parquet", 0), + ]); + + let partitions = list_partitions( + store.as_ref(), + &ListingTableUrl::parse("file:///tablepath/").unwrap(), + 0, + None, + ) + .await + .expect("listing partitions failed"); + + assert_eq!( + &partitions + .iter() + .map(describe_partition) + .collect::>(), + &vec![ + ("tablepath", 0, vec![]), + ("tablepath/part1=p1v1", 1, vec![]), + ("tablepath/part1=p1v2", 1, vec![]), + ("tablepath/part1=p1v3", 1, vec![]), + ] + ); + + let partitions = list_partitions( + store.as_ref(), + &ListingTableUrl::parse("file:///tablepath/").unwrap(), + 1, + None, + ) + .await + .expect("listing partitions failed"); + + assert_eq!( + &partitions + .iter() + .map(describe_partition) + .collect::>(), + &vec![ + ("tablepath", 0, vec![]), + ("tablepath/part1=p1v1", 1, vec!["file.parquet"]), + ("tablepath/part1=p1v2", 1, vec![]), + ("tablepath/part1=p1v2/part2=p2v1", 2, vec![]), + ("tablepath/part1=p1v2/part2=p2v2", 2, vec![]), + ("tablepath/part1=p1v3", 1, vec![]), + ("tablepath/part1=p1v3/part2=p2v1", 2, vec![]), + ] + ); + + let partitions = list_partitions( + store.as_ref(), + &ListingTableUrl::parse("file:///tablepath/").unwrap(), + 2, + None, + ) + .await + .expect("listing partitions failed"); + + assert_eq!( + &partitions + .iter() + .map(describe_partition) + .collect::>(), + &vec![ + ("tablepath", 0, vec![]), + ("tablepath/part1=p1v1", 1, vec!["file.parquet"]), + ("tablepath/part1=p1v2", 1, vec![]), + ("tablepath/part1=p1v3", 1, vec![]), + ( + "tablepath/part1=p1v2/part2=p2v1", + 2, + vec!["file1.parquet", "file2.parquet"] + ), + ("tablepath/part1=p1v2/part2=p2v2", 2, vec!["file4.parquet"]), + ("tablepath/part1=p1v3/part2=p2v1", 2, vec!["file3.parquet"]), + ] + ); +} + +pub fn make_test_store_and_state( + files: &[(&str, u64)], +) -> (Arc, Arc) { + let memory = InMemory::new(); + + for (name, size) in files { + memory + .put(&Path::from(*name), vec![0; *size as usize].into()) + .now_or_never() + .unwrap() + .unwrap(); + } + + let state = SessionStateBuilder::new().build(); + (Arc::new(memory), Arc::new(state)) +} diff --git a/datafusion/core/tests/core_integration.rs b/datafusion/core/tests/core_integration.rs index edcf039e4e70..cc4dfcf72059 100644 --- 
a/datafusion/core/tests/core_integration.rs +++ b/datafusion/core/tests/core_integration.rs @@ -57,6 +57,9 @@ mod serde; /// Run all tests that are found in the `catalog` directory mod catalog; +/// Run all tests that are found in the `catalog_listing` directory +mod catalog_listing; + /// Run all tests that are found in the `tracing` directory mod tracing; diff --git a/datafusion/core/tests/data/partitioned_table_arrow_stream/part=123/data.arrow b/datafusion/core/tests/data/partitioned_table_arrow_stream/part=123/data.arrow new file mode 100644 index 000000000000..bad9e3de4a57 Binary files /dev/null and b/datafusion/core/tests/data/partitioned_table_arrow_stream/part=123/data.arrow differ diff --git a/datafusion/core/tests/data/partitioned_table_arrow_stream/part=456/data.arrow b/datafusion/core/tests/data/partitioned_table_arrow_stream/part=456/data.arrow new file mode 100644 index 000000000000..4a07fbfa47f3 Binary files /dev/null and b/datafusion/core/tests/data/partitioned_table_arrow_stream/part=456/data.arrow differ diff --git a/datafusion/core/tests/dataframe/dataframe_functions.rs b/datafusion/core/tests/dataframe/dataframe_functions.rs index d95eb38c19e1..265862ff9af8 100644 --- a/datafusion/core/tests/dataframe/dataframe_functions.rs +++ b/datafusion/core/tests/dataframe/dataframe_functions.rs @@ -309,16 +309,16 @@ async fn test_fn_arrow_typeof() -> Result<()> { assert_snapshot!( batches_to_string(&batches), - @r#" - +------------------------------------------------------------------------------------------------------------------+ - | arrow_typeof(test.l) | - +------------------------------------------------------------------------------------------------------------------+ - | List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) | - | List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) | - | List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) | - | List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) | - +------------------------------------------------------------------------------------------------------------------+ - "#); + @r" + +----------------------+ + | arrow_typeof(test.l) | + +----------------------+ + | List(nullable Int32) | + | List(nullable Int32) | + | List(nullable Int32) | + | List(nullable Int32) | + +----------------------+ + "); Ok(()) } diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index 979ada2bc6bb..aab51efa1e34 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -45,6 +45,7 @@ use insta::assert_snapshot; use object_store::local::LocalFileSystem; use std::collections::HashMap; use std::fs; +use std::path::Path; use std::sync::Arc; use tempfile::TempDir; use url::Url; @@ -77,7 +78,7 @@ use datafusion_expr::var_provider::{VarProvider, VarType}; use datafusion_expr::{ cast, col, create_udf, exists, in_subquery, lit, out_ref_col, placeholder, scalar_subquery, when, wildcard, Expr, ExprFunctionExt, ExprSchemable, LogicalPlan, - LogicalPlanBuilder, ScalarFunctionImplementation, SortExpr, WindowFrame, + LogicalPlanBuilder, ScalarFunctionImplementation, SortExpr, TableType, WindowFrame, WindowFrameBound, WindowFrameUnits, WindowFunctionDefinition, }; use datafusion_physical_expr::aggregate::AggregateExprBuilder; @@ -404,6 +405,55 
@@ async fn select_with_periods() -> Result<()> { Ok(()) } +#[tokio::test] +async fn select_columns_duplicated_names_from_different_qualifiers() -> Result<()> { + let t1 = test_table_with_name("t1") + .await? + .select_columns(&["c1"])? + .limit(0, Some(3))?; + let t2 = test_table_with_name("t2") + .await? + .select_columns(&["c1"])? + .limit(3, Some(3))?; + let t3 = test_table_with_name("t3") + .await? + .select_columns(&["c1"])? + .limit(6, Some(3))?; + + let join_res = t1 + .join(t2, JoinType::Left, &["t1.c1"], &["t2.c1"], None)? + .join(t3, JoinType::Left, &["t1.c1"], &["t3.c1"], None)?; + assert_snapshot!( + batches_to_sort_string(&join_res.clone().collect().await.unwrap()), + @r" + +----+----+----+ + | c1 | c1 | c1 | + +----+----+----+ + | b | b | | + | b | b | | + | c | | | + | d | | d | + +----+----+----+ + " + ); + + let select_res = join_res.select_columns(&["c1"])?; + assert_snapshot!( + batches_to_sort_string(&select_res.clone().collect().await.unwrap()), + @r" + +----+----+----+ + | c1 | c1 | c1 | + +----+----+----+ + | b | b | | + | b | b | | + | c | | | + | d | | d | + +----+----+----+ + " + ); + Ok(()) +} + #[tokio::test] async fn drop_columns() -> Result<()> { // build plan using Table API @@ -542,6 +592,54 @@ async fn drop_with_periods() -> Result<()> { Ok(()) } +#[tokio::test] +async fn drop_columns_duplicated_names_from_different_qualifiers() -> Result<()> { + let t1 = test_table_with_name("t1") + .await? + .select_columns(&["c1"])? + .limit(0, Some(3))?; + let t2 = test_table_with_name("t2") + .await? + .select_columns(&["c1"])? + .limit(3, Some(3))?; + let t3 = test_table_with_name("t3") + .await? + .select_columns(&["c1"])? + .limit(6, Some(3))?; + + let join_res = t1 + .join(t2, JoinType::LeftMark, &["c1"], &["c1"], None)? + .join(t3, JoinType::LeftMark, &["c1"], &["c1"], None)?; + assert_snapshot!( + batches_to_sort_string(&join_res.clone().collect().await.unwrap()), + @r" + +----+-------+-------+ + | c1 | mark | mark | + +----+-------+-------+ + | b | true | false | + | c | false | false | + | d | false | true | + +----+-------+-------+ + " + ); + + let drop_res = join_res.drop_columns(&["mark"])?; + assert_snapshot!( + batches_to_sort_string(&drop_res.clone().collect().await.unwrap()), + @r" + +----+ + | c1 | + +----+ + | b | + | c | + | d | + +----+ + " + ); + + Ok(()) +} + #[tokio::test] async fn aggregate() -> Result<()> { // build plan using DataFrame API @@ -1577,6 +1675,23 @@ async fn register_table() -> Result<()> { Ok(()) } +#[tokio::test] +async fn register_temporary_table() -> Result<()> { + let df = test_table().await?.select_columns(&["c1", "c12"])?; + let ctx = SessionContext::new(); + let df_impl = DataFrame::new(ctx.state(), df.logical_plan().clone()); + + let df_table_provider = df_impl.clone().into_temporary_view(); + + // check that we set the correct table_type + assert_eq!(df_table_provider.table_type(), TableType::Temporary); + + // check that we can register a dataframe as a temporary table + ctx.register_table("test_table", df_table_provider)?; + + Ok(()) +} + /// Compare the formatted string representation of two plans for equality fn assert_same_plan(plan1: &LogicalPlan, plan2: &LogicalPlan) { assert_eq!(format!("{plan1:?}"), format!("{plan2:?}")); @@ -2749,10 +2864,9 @@ async fn test_count_wildcard_on_sort() -> Result<()> { | | ProjectionExec: expr=[b@0 as b, count(Int64(1))@1 as count(*), count(Int64(1))@1 as count(Int64(1))] | | | AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[count(Int64(1))] | | | CoalesceBatchesExec: 
target_batch_size=8192 | - | | RepartitionExec: partitioning=Hash([b@0], 4), input_partitions=4 | - | | RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 | - | | AggregateExec: mode=Partial, gby=[b@0 as b], aggr=[count(Int64(1))] | - | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | RepartitionExec: partitioning=Hash([b@0], 4), input_partitions=1 | + | | AggregateExec: mode=Partial, gby=[b@0 as b], aggr=[count(Int64(1))] | + | | DataSourceExec: partitions=1, partition_sizes=[1] | | | | +---------------+------------------------------------------------------------------------------------------------------------+ "### @@ -2761,22 +2875,21 @@ async fn test_count_wildcard_on_sort() -> Result<()> { assert_snapshot!( pretty_format_batches(&df_results).unwrap(), @r###" - +---------------+--------------------------------------------------------------------------------+ - | plan_type | plan | - +---------------+--------------------------------------------------------------------------------+ - | logical_plan | Sort: count(*) ASC NULLS LAST | - | | Aggregate: groupBy=[[t1.b]], aggr=[[count(Int64(1)) AS count(*)]] | - | | TableScan: t1 projection=[b] | - | physical_plan | SortPreservingMergeExec: [count(*)@1 ASC NULLS LAST] | - | | SortExec: expr=[count(*)@1 ASC NULLS LAST], preserve_partitioning=[true] | - | | AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[count(*)] | - | | CoalesceBatchesExec: target_batch_size=8192 | - | | RepartitionExec: partitioning=Hash([b@0], 4), input_partitions=4 | - | | RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 | - | | AggregateExec: mode=Partial, gby=[b@0 as b], aggr=[count(*)] | - | | DataSourceExec: partitions=1, partition_sizes=[1] | - | | | - +---------------+--------------------------------------------------------------------------------+ + +---------------+----------------------------------------------------------------------------+ + | plan_type | plan | + +---------------+----------------------------------------------------------------------------+ + | logical_plan | Sort: count(*) ASC NULLS LAST | + | | Aggregate: groupBy=[[t1.b]], aggr=[[count(Int64(1)) AS count(*)]] | + | | TableScan: t1 projection=[b] | + | physical_plan | SortPreservingMergeExec: [count(*)@1 ASC NULLS LAST] | + | | SortExec: expr=[count(*)@1 ASC NULLS LAST], preserve_partitioning=[true] | + | | AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[count(*)] | + | | CoalesceBatchesExec: target_batch_size=8192 | + | | RepartitionExec: partitioning=Hash([b@0], 4), input_partitions=1 | + | | AggregateExec: mode=Partial, gby=[b@0 as b], aggr=[count(*)] | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | | + +---------------+----------------------------------------------------------------------------+ "### ); Ok(()) @@ -2944,18 +3057,18 @@ async fn test_count_wildcard_on_window() -> Result<()> { assert_snapshot!( pretty_format_batches(&sql_results).unwrap(), @r#" - +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | plan_type | plan | - 
+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | logical_plan | Projection: count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING AS count(*) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING | - | | WindowAggr: windowExpr=[[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING]] | - | | TableScan: t1 projection=[a] | - | physical_plan | ProjectionExec: expr=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@1 as count(*) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING] | - | | BoundedWindowAggExec: wdw=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING: Field { name: "count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING], mode=[Sorted] | - | | SortExec: expr=[a@0 DESC], preserve_partitioning=[false] | - | | DataSourceExec: partitions=1, partition_sizes=[1] | - | | | - +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + | plan_type | plan | + +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + | logical_plan | Projection: count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING AS count(*) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING | + | | WindowAggr: windowExpr=[[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING]] | + | | TableScan: t1 projection=[a] | + | physical_plan | ProjectionExec: expr=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@1 as count(*) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING] | + | | BoundedWindowAggExec: wdw=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING: Field { "count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING": Int64 }, frame: RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING], mode=[Sorted] | + | | SortExec: expr=[a@0 DESC], preserve_partitioning=[false] | + | | 
DataSourceExec: partitions=1, partition_sizes=[1] | + | | | + +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ "# ); @@ -2978,24 +3091,174 @@ async fn test_count_wildcard_on_window() -> Result<()> { assert_snapshot!( pretty_format_batches(&df_results).unwrap(), @r#" - +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | plan_type | plan | - +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | logical_plan | Projection: count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING | - | | WindowAggr: windowExpr=[[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING]] | - | | TableScan: t1 projection=[a] | - | physical_plan | ProjectionExec: expr=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@1 as count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING] | - | | BoundedWindowAggExec: wdw=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING: Field { name: "count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING], mode=[Sorted] | - | | SortExec: expr=[a@0 DESC], preserve_partitioning=[false] | - | | DataSourceExec: partitions=1, partition_sizes=[1] | - | | | - +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + | plan_type | plan | + +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + | logical_plan | Projection: count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE 
BETWEEN 6 PRECEDING AND 2 FOLLOWING | + | | WindowAggr: windowExpr=[[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING]] | + | | TableScan: t1 projection=[a] | + | physical_plan | ProjectionExec: expr=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@1 as count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING] | + | | BoundedWindowAggExec: wdw=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING: Field { "count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING": Int64 }, frame: RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING], mode=[Sorted] | + | | SortExec: expr=[a@0 DESC], preserve_partitioning=[false] | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | | + +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ "# ); Ok(()) } +#[tokio::test] +// Test with `repartition_sorts` disabled, causing a full resort of the data +async fn union_with_mix_of_presorted_and_explicitly_resorted_inputs_with_repartition_sorts_false( +) -> Result<()> { + assert_snapshot!( + union_with_mix_of_presorted_and_explicitly_resorted_inputs_impl(false).await?, + @r#" + AggregateExec: mode=Final, gby=[id@0 as id], aggr=[], ordering_mode=Sorted + SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] + CoalescePartitionsExec + AggregateExec: mode=Partial, gby=[id@0 as id], aggr=[] + UnionExec + DataSourceExec: file_groups={1 group: [[{testdata}/alltypes_tiny_pages.parquet]]}, projection=[id], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet + DataSourceExec: file_groups={1 group: [[{testdata}/alltypes_tiny_pages.parquet]]}, projection=[id], file_type=parquet + "#); + Ok(()) +} + +#[ignore] // See https://github.com/apache/datafusion/issues/18380 +#[tokio::test] +// Test with `repartition_sorts` enabled to preserve pre-sorted partitions and avoid resorting +async fn union_with_mix_of_presorted_and_explicitly_resorted_inputs_with_repartition_sorts_true( +) -> Result<()> { + assert_snapshot!( + union_with_mix_of_presorted_and_explicitly_resorted_inputs_impl(true).await?, + @r#" + AggregateExec: mode=Final, gby=[id@0 as id], aggr=[], ordering_mode=Sorted + SortPreservingMergeExec: [id@0 ASC NULLS LAST] + AggregateExec: mode=Partial, gby=[id@0 as id], aggr=[], ordering_mode=Sorted + UnionExec + DataSourceExec: file_groups={1 group: [[{testdata}/alltypes_tiny_pages.parquet]]}, projection=[id], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet + SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[{testdata}/alltypes_tiny_pages.parquet]]}, projection=[id], file_type=parquet + "#); + + // 💥 Doesn't pass, and generates this plan: + // + // AggregateExec: mode=Final, gby=[id@0 as id], aggr=[], ordering_mode=Sorted + // SortPreservingMergeExec: [id@0 ASC NULLS LAST] + // SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true] + // AggregateExec: mode=Partial, gby=[id@0 as id], aggr=[] + // UnionExec + // DataSourceExec: file_groups={1 group: [[{testdata}/alltypes_tiny_pages.parquet]]}, projection=[id], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet + // DataSourceExec: 
file_groups={1 group: [[{testdata}/alltypes_tiny_pages.parquet]]}, projection=[id], file_type=parquet + // + // + // === Excerpt from the verbose explain === + // + // +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + // | plan_type | plan | + // +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + // | initial_physical_plan | AggregateExec: mode=Final, gby=[id@0 as id], aggr=[], ordering_mode=Sorted | + // | | AggregateExec: mode=Partial, gby=[id@0 as id], aggr=[], ordering_mode=Sorted | + // | | UnionExec | + // | | DataSourceExec: file_groups={1 group: [[{testdata}/alltypes_tiny_pages.parquet]]}, projection=[id], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet | + // | | SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] | + // | | DataSourceExec: file_groups={1 group: [[{testdata}/alltypes_tiny_pages.parquet]]}, projection=[id], file_type=parquet | + // ... + // | physical_plan after EnforceDistribution | OutputRequirementExec: order_by=[], dist_by=Unspecified | + // | | AggregateExec: mode=Final, gby=[id@0 as id], aggr=[], ordering_mode=Sorted | + // | | SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] | + // | | CoalescePartitionsExec | + // | | AggregateExec: mode=Partial, gby=[id@0 as id], aggr=[], ordering_mode=Sorted | + // | | UnionExec | + // | | DataSourceExec: file_groups={1 group: [[{testdata}/alltypes_tiny_pages.parquet]]}, projection=[id], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet | + // | | SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] | + // | | DataSourceExec: file_groups={1 group: [[{testdata}/alltypes_tiny_pages.parquet]]}, projection=[id], file_type=parquet | + // | | | + // | physical_plan after CombinePartialFinalAggregate | SAME TEXT AS ABOVE + // | | | + // | physical_plan after EnforceSorting | OutputRequirementExec: order_by=[], dist_by=Unspecified | + // | | AggregateExec: mode=Final, gby=[id@0 as id], aggr=[], ordering_mode=Sorted | + // | | SortPreservingMergeExec: [id@0 ASC NULLS LAST] | + // | | SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true] | + // | | AggregateExec: mode=Partial, gby=[id@0 as id], aggr=[] | + // | | UnionExec | + // | | DataSourceExec: file_groups={1 group: [[{testdata}/alltypes_tiny_pages.parquet]]}, projection=[id], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet | + // | | DataSourceExec: file_groups={1 group: [[{testdata}/alltypes_tiny_pages.parquet]]}, projection=[id], file_type=parquet | + // ... 
+ // +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + + Ok(()) +} + +async fn union_with_mix_of_presorted_and_explicitly_resorted_inputs_impl( + repartition_sorts: bool, +) -> Result { + let config = SessionConfig::default() + .with_target_partitions(1) + .with_repartition_sorts(repartition_sorts); + let ctx = SessionContext::new_with_config(config); + + let testdata = parquet_test_data(); + + // Register "sorted" table, that is sorted + ctx.register_parquet( + "sorted", + &format!("{testdata}/alltypes_tiny_pages.parquet"), + ParquetReadOptions::default() + .file_sort_order(vec![vec![col("id").sort(true, false)]]), + ) + .await?; + + // Register "unsorted" table + ctx.register_parquet( + "unsorted", + &format!("{testdata}/alltypes_tiny_pages.parquet"), + ParquetReadOptions::default(), + ) + .await?; + + let source_sorted = ctx + .table("sorted") + .await + .unwrap() + .select(vec![col("id")]) + .unwrap(); + + let source_unsorted = ctx + .table("unsorted") + .await + .unwrap() + .select(vec![col("id")]) + .unwrap(); + + let source_unsorted_resorted = + source_unsorted.sort(vec![col("id").sort(true, false)])?; + + let union = source_sorted.union(source_unsorted_resorted)?; + + let agg = union.aggregate(vec![col("id")], vec![])?; + + let df = agg; + + // To be able to remove user specific paths from the plan, for stable assertions + let testdata_clean = Path::new(&testdata).canonicalize()?.display().to_string(); + let testdata_clean = testdata_clean.strip_prefix("/").unwrap_or(&testdata_clean); + + // Use displayable() rather than explain().collect() to avoid table formatting issues. We need + // to replace machine-specific paths with variable lengths, which breaks table alignment and + // causes snapshot mismatches. 
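 + // In other words: render the plan with `displayable(...).indent(true)` (one
 + // operator per line, no table borders) and swap the absolute test-data path
 + // for a "{testdata}" placeholder so the snapshot is identical on every machine.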
+ let physical_plan = df.create_physical_plan().await?; + let displayable_plan = displayable(physical_plan.as_ref()) + .indent(true) + .to_string() + .replace(testdata_clean, "{testdata}"); + + Ok(displayable_plan) +} + #[tokio::test] async fn test_count_wildcard_on_aggregate() -> Result<()> { let ctx = create_join_context()?; @@ -3086,10 +3349,9 @@ async fn test_count_wildcard_on_where_scalar_subquery() -> Result<()> { | | ProjectionExec: expr=[count(Int64(1))@1 as count(*), a@0 as a, true as __always_true] | | | AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[count(Int64(1))] | | | CoalesceBatchesExec: target_batch_size=8192 | - | | RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4 | - | | RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 | - | | AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count(Int64(1))] | - | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=1 | + | | AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count(Int64(1))] | + | | DataSourceExec: partitions=1, partition_sizes=[1] | | | | +---------------+---------------------------------------------------------------------------------------------------------------------------+ " @@ -3143,10 +3405,9 @@ async fn test_count_wildcard_on_where_scalar_subquery() -> Result<()> { | | ProjectionExec: expr=[count(*)@1 as count(*), a@0 as a, true as __always_true] | | | AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[count(*)] | | | CoalesceBatchesExec: target_batch_size=8192 | - | | RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4 | - | | RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 | - | | AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count(*)] | - | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=1 | + | | AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count(*)] | + | | DataSourceExec: partitions=1, partition_sizes=[1] | | | | +---------------+---------------------------------------------------------------------------------------------------------------------------+ " @@ -4435,12 +4696,12 @@ async fn unnest_with_redundant_columns() -> Result<()> { let actual = formatted.trim(); assert_snapshot!( actual, - @r###" + @r" Projection: shapes.shape_id [shape_id:UInt32] Unnest: lists[shape_id2|depth=1] structs[] [shape_id:UInt32, shape_id2:UInt32;N] - Aggregate: groupBy=[[shapes.shape_id]], aggr=[[array_agg(shapes.shape_id) AS shape_id2]] [shape_id:UInt32, shape_id2:List(Field { name: "item", data_type: UInt32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[shapes.shape_id]], aggr=[[array_agg(shapes.shape_id) AS shape_id2]] [shape_id:UInt32, shape_id2:List(Field { data_type: UInt32, nullable: true });N] TableScan: shapes projection=[shape_id] [shape_id:UInt32] - "### + " ); let results = df.collect().await?; @@ -6311,7 +6572,7 @@ async fn test_copy_schema() -> Result<()> { let target_path = tmp_dir.path().join("target.csv"); let query = format!( - "COPY source_table TO '{:?}' STORED AS csv", + "COPY source_table TO '{}' STORED AS csv", target_path.to_str().unwrap() ); @@ -6459,10 +6720,10 @@ async fn test_duplicate_state_fields_for_dfschema_construct() -> Result<()> { "ticker", "first_value(value)[first_value]", "timestamp@0", - "is_set", + "first_value(value)[first_value_is_set]", "last_value(value)[last_value]", "timestamp@0", - 
"is_set", + "last_value(value)[last_value_is_set]", ]; let binding = partial_agg.schema(); diff --git a/datafusion/core/tests/datasource/object_store_access.rs b/datafusion/core/tests/datasource/object_store_access.rs index 6b9585f408a1..33129150db58 100644 --- a/datafusion/core/tests/datasource/object_store_access.rs +++ b/datafusion/core/tests/datasource/object_store_access.rs @@ -27,7 +27,10 @@ use arrow::array::{ArrayRef, Int32Array, RecordBatch}; use async_trait::async_trait; use bytes::Bytes; -use datafusion::prelude::{CsvReadOptions, SessionContext}; +use datafusion::prelude::{CsvReadOptions, ParquetReadOptions, SessionContext}; +use datafusion_catalog_listing::{ListingOptions, ListingTable, ListingTableConfig}; +use datafusion_datasource::ListingTableUrl; +use datafusion_datasource_csv::CsvFormat; use futures::stream::BoxStream; use insta::assert_snapshot; use object_store::memory::InMemory; @@ -45,8 +48,9 @@ use url::Url; #[tokio::test] async fn create_single_csv_file() { + let test = Test::new().with_single_file_csv().await; assert_snapshot!( - single_file_csv_test().await.requests(), + test.requests(), @r" RequestCountingObjectStore() Total Requests: 2 @@ -58,8 +62,9 @@ async fn create_single_csv_file() { #[tokio::test] async fn query_single_csv_file() { + let test = Test::new().with_single_file_csv().await; assert_snapshot!( - single_file_csv_test().await.query("select * from csv_table").await, + test.query("select * from csv_table").await, @r" ------- Query Output (2 rows) ------- +---------+-------+-------+ @@ -79,8 +84,9 @@ async fn query_single_csv_file() { #[tokio::test] async fn create_multi_file_csv_file() { + let test = Test::new().with_multi_file_csv().await; assert_snapshot!( - multi_file_csv_test().await.requests(), + test.requests(), @r" RequestCountingObjectStore() Total Requests: 4 @@ -94,8 +100,9 @@ async fn create_multi_file_csv_file() { #[tokio::test] async fn query_multi_csv_file() { + let test = Test::new().with_multi_file_csv().await; assert_snapshot!( - multi_file_csv_test().await.query("select * from csv_table").await, + test.query("select * from csv_table").await, @r" ------- Query Output (6 rows) ------- +---------+-------+-------+ @@ -120,24 +127,250 @@ async fn query_multi_csv_file() { } #[tokio::test] -async fn create_single_parquet_file() { +async fn query_partitioned_csv_file() { + let test = Test::new().with_partitioned_csv().await; assert_snapshot!( - single_file_parquet_test().await.requests(), + test.query("select * from csv_table_partitioned").await, + @r" + ------- Query Output (6 rows) ------- + +---------+-------+-------+---+----+-----+ + | d1 | d2 | d3 | a | b | c | + +---------+-------+-------+---+----+-----+ + | 0.00001 | 1e-12 | true | 1 | 10 | 100 | + | 0.00003 | 5e-12 | false | 1 | 10 | 100 | + | 0.00002 | 2e-12 | true | 2 | 20 | 200 | + | 0.00003 | 5e-12 | false | 2 | 20 | 200 | + | 0.00003 | 3e-12 | true | 3 | 30 | 300 | + | 0.00003 | 5e-12 | false | 3 | 30 | 300 | + +---------+-------+-------+---+----+-----+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 4 + - LIST prefix=data + - GET (opts) path=data/a=1/b=10/c=100/file_1.csv + - GET (opts) path=data/a=2/b=20/c=200/file_2.csv + - GET (opts) path=data/a=3/b=30/c=300/file_3.csv + " + ); + + assert_snapshot!( + test.query("select * from csv_table_partitioned WHERE a=2").await, + @r" + ------- Query Output (2 rows) ------- + +---------+-------+-------+---+----+-----+ + | d1 | d2 | d3 | a | b | c | + 
+---------+-------+-------+---+----+-----+ + | 0.00002 | 2e-12 | true | 2 | 20 | 200 | + | 0.00003 | 5e-12 | false | 2 | 20 | 200 | + +---------+-------+-------+---+----+-----+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 2 + - LIST prefix=data + - GET (opts) path=data/a=2/b=20/c=200/file_2.csv + " + ); + + assert_snapshot!( + test.query("select * from csv_table_partitioned WHERE b=20").await, + @r" + ------- Query Output (2 rows) ------- + +---------+-------+-------+---+----+-----+ + | d1 | d2 | d3 | a | b | c | + +---------+-------+-------+---+----+-----+ + | 0.00002 | 2e-12 | true | 2 | 20 | 200 | + | 0.00003 | 5e-12 | false | 2 | 20 | 200 | + +---------+-------+-------+---+----+-----+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 2 + - LIST prefix=data + - GET (opts) path=data/a=2/b=20/c=200/file_2.csv + " + ); + + assert_snapshot!( + test.query("select * from csv_table_partitioned WHERE c=200").await, + @r" + ------- Query Output (2 rows) ------- + +---------+-------+-------+---+----+-----+ + | d1 | d2 | d3 | a | b | c | + +---------+-------+-------+---+----+-----+ + | 0.00002 | 2e-12 | true | 2 | 20 | 200 | + | 0.00003 | 5e-12 | false | 2 | 20 | 200 | + +---------+-------+-------+---+----+-----+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 2 + - LIST prefix=data + - GET (opts) path=data/a=2/b=20/c=200/file_2.csv + " + ); + + assert_snapshot!( + test.query("select * from csv_table_partitioned WHERE a=2 AND b=20").await, + @r" + ------- Query Output (2 rows) ------- + +---------+-------+-------+---+----+-----+ + | d1 | d2 | d3 | a | b | c | + +---------+-------+-------+---+----+-----+ + | 0.00002 | 2e-12 | true | 2 | 20 | 200 | + | 0.00003 | 5e-12 | false | 2 | 20 | 200 | + +---------+-------+-------+---+----+-----+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 2 + - LIST prefix=data + - GET (opts) path=data/a=2/b=20/c=200/file_2.csv + " + ); + + assert_snapshot!( + test.query("select * from csv_table_partitioned WHERE a<2 AND b=10 AND c=100").await, + @r" + ------- Query Output (2 rows) ------- + +---------+-------+-------+---+----+-----+ + | d1 | d2 | d3 | a | b | c | + +---------+-------+-------+---+----+-----+ + | 0.00001 | 1e-12 | true | 1 | 10 | 100 | + | 0.00003 | 5e-12 | false | 1 | 10 | 100 | + +---------+-------+-------+---+----+-----+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 2 + - LIST prefix=data + - GET (opts) path=data/a=1/b=10/c=100/file_1.csv + " + ); +} + +#[tokio::test] +async fn create_single_parquet_file_default() { + // The default metadata size hint is 512KB + // which is enough to fetch the entire footer metadata and PageIndex + // in a single GET request. 
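 + // As a rough sketch (editor's assumption, not taken from this patch), the same
 + // hint can be set when registering a table directly; `metadata_size_hint` on
 + // `ParquetReadOptions` takes an `Option<usize>`:
 + //
 + //     let options = ParquetReadOptions::new().metadata_size_hint(Some(512 * 1024));
 + //     ctx.register_parquet("t", "path/to/t.parquet", options).await?;
 + //
 + // A hint that covers the footer plus the PageIndex lets the reader fetch all
 + // metadata in the single ranged GET counted below.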
+ let test = Test::new().with_single_file_parquet().await; + // expect 1 get request which reads the footer metadata and page index + assert_snapshot!( + test.requests(), + @r" + RequestCountingObjectStore() + Total Requests: 2 + - HEAD path=parquet_table.parquet + - GET (range) range=0-2994 path=parquet_table.parquet + " + ); +} + +#[tokio::test] +async fn create_single_parquet_file_prefetch() { + // Explicitly specify a prefetch hint that is adequate for the footer and page index + let test = Test::new() + .with_parquet_metadata_size_hint(Some(1000)) + .with_single_file_parquet() + .await; + // expect 1 1000 byte request which reads the footer metadata and page index + assert_snapshot!( + test.requests(), + @r" + RequestCountingObjectStore() + Total Requests: 2 + - HEAD path=parquet_table.parquet + - GET (range) range=1994-2994 path=parquet_table.parquet + " + ); +} + +#[tokio::test] +async fn create_single_parquet_file_too_small_prefetch() { + // configure a prefetch size that is too small to fetch the footer + // metadata + // + // Using the ranges from the test below (with no_prefetch), + // pick a number less than 730: + // -------- + // 2286-2294: (8 bytes) footer + length + // 2264-2986: (722 bytes) footer metadata + let test = Test::new() + .with_parquet_metadata_size_hint(Some(500)) + .with_single_file_parquet() + .await; + // expect three get requests: + // 1. read the footer (500 bytes per hint, not enough for the footer metadata) + // 2. Read the footer metadata + // 3. reads the PageIndex + assert_snapshot!( + test.requests(), @r" RequestCountingObjectStore() Total Requests: 4 - HEAD path=parquet_table.parquet - - GET (range) range=2986-2994 path=parquet_table.parquet + - GET (range) range=2494-2994 path=parquet_table.parquet - GET (range) range=2264-2986 path=parquet_table.parquet - GET (range) range=2124-2264 path=parquet_table.parquet " ); } +#[tokio::test] +async fn create_single_parquet_file_small_prefetch() { + // configure a prefetch size that is large enough for the footer + // metadata but **not** the PageIndex + // + // Using the ranges from the test below (with no_prefetch), + // the 730 is determined as follows; + // -------- + // 2286-2294: (8 bytes) footer + length + // 2264-2986: (722 bytes) footer metadata + let test = Test::new() + // 740 is enough to get both the footer + length (8 bytes) + // but not the entire PageIndex + .with_parquet_metadata_size_hint(Some(740)) + .with_single_file_parquet() + .await; + // expect two get requests: + // 1. read the footer metadata + // 2. reads the PageIndex + assert_snapshot!( + test.requests(), + @r" + RequestCountingObjectStore() + Total Requests: 3 + - HEAD path=parquet_table.parquet + - GET (range) range=2254-2994 path=parquet_table.parquet + - GET (range) range=2124-2264 path=parquet_table.parquet + " + ); +} + +#[tokio::test] +async fn create_single_parquet_file_no_prefetch() { + let test = Test::new() + // force no prefetch by setting size hint to None + .with_parquet_metadata_size_hint(None) + .with_single_file_parquet() + .await; + // Without a metadata size hint, the parquet reader + // does *three* range requests to read the footer metadata: + // 1. The footer length (last 8 bytes) + // 2. The footer metadata + // 3. 
The PageIndex metadata + assert_snapshot!( + test.requests(), + @r" + RequestCountingObjectStore() + Total Requests: 2 + - HEAD path=parquet_table.parquet + - GET (range) range=0-2994 path=parquet_table.parquet + " + ); +} + #[tokio::test] async fn query_single_parquet_file() { + let test = Test::new().with_single_file_parquet().await; assert_snapshot!( - single_file_parquet_test().await.query("select count(distinct a), count(b) from parquet_table").await, + test.query("select count(distinct a), count(b) from parquet_table").await, @r" ------- Query Output (1 rows) ------- +---------------------------------+------------------------+ @@ -157,10 +390,11 @@ async fn query_single_parquet_file() { #[tokio::test] async fn query_single_parquet_file_with_single_predicate() { + let test = Test::new().with_single_file_parquet().await; // Note that evaluating predicates requires additional object store requests // (to evaluate predicates) assert_snapshot!( - single_file_parquet_test().await.query("select min(a), max(b) from parquet_table WHERE a > 150").await, + test.query("select min(a), max(b) from parquet_table WHERE a > 150").await, @r" ------- Query Output (1 rows) ------- +----------------------+----------------------+ @@ -179,10 +413,12 @@ async fn query_single_parquet_file_with_single_predicate() { #[tokio::test] async fn query_single_parquet_file_multi_row_groups_multiple_predicates() { + let test = Test::new().with_single_file_parquet().await; + // Note that evaluating predicates requires additional object store requests // (to evaluate predicates) assert_snapshot!( - single_file_parquet_test().await.query("select min(a), max(b) from parquet_table WHERE a > 50 AND b < 1150").await, + test.query("select min(a), max(b) from parquet_table WHERE a > 50 AND b < 1150").await, @r" ------- Query Output (1 rows) ------- +----------------------+----------------------+ @@ -200,75 +436,16 @@ async fn query_single_parquet_file_multi_row_groups_multiple_predicates() { ); } -/// Create a test with a single CSV file with three columns and two rows -async fn single_file_csv_test() -> Test { - // upload CSV data to object store - let csv_data = r#"c1,c2,c3 -0.00001,5e-12,true -0.00002,4e-12,false -"#; - - Test::new() - .with_bytes("/csv_table.csv", csv_data) - .await - .register_csv("csv_table", "/csv_table.csv") - .await -} - -/// Create a test with three CSV files in a directory -async fn multi_file_csv_test() -> Test { - let mut test = Test::new(); - // upload CSV data to object store - for i in 0..3 { - let csv_data1 = format!( - r#"c1,c2,c3 -0.0000{i},{i}e-12,true -0.00003,5e-12,false -"# - ); - test = test - .with_bytes(&format!("/data/file_{i}.csv"), csv_data1) - .await; - } - // register table - test.register_csv("csv_table", "/data/").await -} - -/// Create a test with a single parquet file that has two -/// columns and two row groups -/// -/// Column "a": Int32 with values 0-100] in row group 1 -/// and [101-200] in row group 2 -/// -/// Column "b": Int32 with values 1000-1100] in row group 1 -/// and [1101-1200] in row group 2 -async fn single_file_parquet_test() -> Test { - // Create parquet bytes - let a: ArrayRef = Arc::new(Int32Array::from_iter_values(0..200)); - let b: ArrayRef = Arc::new(Int32Array::from_iter_values(1000..1200)); - let batch = RecordBatch::try_from_iter([("a", a), ("b", b)]).unwrap(); - - let mut buffer = vec![]; - let props = parquet::file::properties::WriterProperties::builder() - .set_max_row_group_size(100) - .build(); - let mut writer = - 
parquet::arrow::ArrowWriter::try_new(&mut buffer, batch.schema(), Some(props)) - .unwrap(); - writer.write(&batch).unwrap(); - writer.close().unwrap(); - - Test::new() - .with_bytes("/parquet_table.parquet", buffer) - .await - .register_parquet("parquet_table", "/parquet_table.parquet") - .await -} - /// Runs tests with a request counting object store struct Test { object_store: Arc<RequestCountingObjectStore>, session_context: SessionContext, + /// metadata size hint to use when registering parquet files + /// + /// * `None`: uses the default (does not set a size_hint) + /// * `Some(None)`: set prefetch hint to None (no prefetching) + /// * `Some(Some(size))`: set prefetch hint to size + parquet_metadata_size_hint: Option<Option<usize>>, } impl Test { @@ -281,9 +458,16 @@ impl Test { Self { object_store, session_context, + parquet_metadata_size_hint: None, } } + /// Specify the metadata size hint to use when registering parquet files + fn with_parquet_metadata_size_hint(mut self, size_hint: Option<usize>) -> Self { + self.parquet_metadata_size_hint = Some(size_hint); + self + } + /// Returns a string representation of all recorded requests thus far fn requests(&self) -> String { format!("{}", self.object_store) @@ -300,7 +484,7 @@ impl Test { self } - /// Register a CSV file at the given path relative to the [`datafusion_test_data`] directory + /// Register a CSV file at the given path async fn register_csv(self, table_name: &str, path: &str) -> Self { let mut options = CsvReadOptions::new(); options.has_header = true; @@ -312,16 +496,133 @@ impl Test { self } - /// Register a CSV file at the given path relative to the [`datafusion_test_data`] directory + /// Register a partitioned CSV table at the given path + async fn register_partitioned_csv(self, table_name: &str, path: &str) -> Self { + let file_format = Arc::new(CsvFormat::default().with_has_header(true)); + let options = ListingOptions::new(file_format); + + let url = format!("mem://{path}").parse().unwrap(); + let table_url = ListingTableUrl::try_new(url, None).unwrap(); + + let session_state = self.session_context.state(); + let mut config = ListingTableConfig::new(table_url).with_listing_options(options); + config = config + .infer_partitions_from_path(&session_state) + .await + .unwrap(); + config = config.infer_schema(&session_state).await.unwrap(); + + let table = Arc::new(ListingTable::try_new(config).unwrap()); + self.session_context + .register_table(table_name, table) + .unwrap(); + self + } + + /// Register a Parquet file at the given path async fn register_parquet(self, table_name: &str, path: &str) -> Self { let path = format!("mem://{path}"); + let mut options: ParquetReadOptions<'_> = ParquetReadOptions::new(); + + // If a metadata size hint was specified, apply it + if let Some(parquet_metadata_size_hint) = self.parquet_metadata_size_hint { + options = options.metadata_size_hint(parquet_metadata_size_hint); + } + self.session_context - .register_parquet(table_name, path, Default::default()) + .register_parquet(table_name, path, options) .await .unwrap(); self } + + /// Register a single CSV file with three columns and two rows named + /// `csv_table` + async fn with_single_file_csv(self) -> Test { + // upload CSV data to object store + let csv_data = r#"c1,c2,c3 +0.00001,5e-12,true +0.00002,4e-12,false +"#; + self.with_bytes("/csv_table.csv", csv_data) + .await + .register_csv("csv_table", "/csv_table.csv") + .await + } + + /// Register three CSV files in a directory, called `csv_table` + async fn with_multi_file_csv(mut self) -> Test { + // upload CSV data to object 
store + for i in 0..3 { + let csv_data1 = format!( + r#"c1,c2,c3 +0.0000{i},{i}e-12,true +0.00003,5e-12,false +"# + ); + self = self + .with_bytes(&format!("/data/file_{i}.csv"), csv_data1) + .await; + } + // register table + self.register_csv("csv_table", "/data/").await + } + + /// Register three CSV files in a partitioned directory structure, called + /// `csv_table_partitioned` + async fn with_partitioned_csv(mut self) -> Test { + for i in 1..4 { + // upload CSV data to object store + let csv_data1 = format!( + r#"d1,d2,d3 +0.0000{i},{i}e-12,true +0.00003,5e-12,false +"# + ); + self = self + .with_bytes( + &format!("/data/a={i}/b={}/c={}/file_{i}.csv", i * 10, i * 100,), + csv_data1, + ) + .await; + } + // register table + self.register_partitioned_csv("csv_table_partitioned", "/data/") + .await + } + + /// Add a single parquet file that has two columns and two row groups named `parquet_table` + /// + /// Column "a": Int32 with values 0-100] in row group 1 + /// and [101-200] in row group 2 + /// + /// Column "b": Int32 with values 1000-1100] in row group 1 + /// and [1101-1200] in row group 2 + async fn with_single_file_parquet(self) -> Test { + // Create parquet bytes + let a: ArrayRef = Arc::new(Int32Array::from_iter_values(0..200)); + let b: ArrayRef = Arc::new(Int32Array::from_iter_values(1000..1200)); + let batch = RecordBatch::try_from_iter([("a", a), ("b", b)]).unwrap(); + + let mut buffer = vec![]; + let props = parquet::file::properties::WriterProperties::builder() + .set_max_row_group_size(100) + .build(); + let mut writer = parquet::arrow::ArrowWriter::try_new( + &mut buffer, + batch.schema(), + Some(props), + ) + .unwrap(); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + + self.with_bytes("/parquet_table.parquet", buffer) + .await + .register_parquet("parquet_table", "/parquet_table.parquet") + .await + } + /// Runs the specified query and returns a string representation of the results /// suitable for comparison with insta snapshots /// diff --git a/datafusion/core/tests/execution/mod.rs b/datafusion/core/tests/execution/mod.rs index 8770b2a20105..f33ef87aa302 100644 --- a/datafusion/core/tests/execution/mod.rs +++ b/datafusion/core/tests/execution/mod.rs @@ -18,3 +18,4 @@ mod coop; mod datasource_split; mod logical_plan; +mod register_arrow; diff --git a/datafusion/core/tests/execution/register_arrow.rs b/datafusion/core/tests/execution/register_arrow.rs new file mode 100644 index 000000000000..4ce16dc0906c --- /dev/null +++ b/datafusion/core/tests/execution/register_arrow.rs @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Integration tests for register_arrow API + +use datafusion::{execution::options::ArrowReadOptions, prelude::*}; +use datafusion_common::Result; + +#[tokio::test] +async fn test_register_arrow_auto_detects_format() -> Result<()> { + let ctx = SessionContext::new(); + + ctx.register_arrow( + "file_format", + "../../datafusion/datasource-arrow/tests/data/example.arrow", + ArrowReadOptions::default(), + ) + .await?; + + ctx.register_arrow( + "stream_format", + "../../datafusion/datasource-arrow/tests/data/example_stream.arrow", + ArrowReadOptions::default(), + ) + .await?; + + let file_result = ctx.sql("SELECT * FROM file_format ORDER BY f0").await?; + let stream_result = ctx.sql("SELECT * FROM stream_format ORDER BY f0").await?; + + let file_batches = file_result.collect().await?; + let stream_batches = stream_result.collect().await?; + + assert_eq!(file_batches.len(), stream_batches.len()); + assert_eq!(file_batches[0].schema(), stream_batches[0].schema()); + + let file_rows: usize = file_batches.iter().map(|b| b.num_rows()).sum(); + let stream_rows: usize = stream_batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(file_rows, stream_rows); + + Ok(()) +} + +#[tokio::test] +async fn test_register_arrow_join_file_and_stream() -> Result<()> { + let ctx = SessionContext::new(); + + ctx.register_arrow( + "file_table", + "../../datafusion/datasource-arrow/tests/data/example.arrow", + ArrowReadOptions::default(), + ) + .await?; + + ctx.register_arrow( + "stream_table", + "../../datafusion/datasource-arrow/tests/data/example_stream.arrow", + ArrowReadOptions::default(), + ) + .await?; + + let result = ctx + .sql( + "SELECT a.f0, a.f1, b.f0, b.f1 + FROM file_table a + JOIN stream_table b ON a.f0 = b.f0 + WHERE a.f0 <= 2 + ORDER BY a.f0", + ) + .await?; + let batches = result.collect().await?; + + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 2); + + Ok(()) +} diff --git a/datafusion/core/tests/expr_api/simplification.rs b/datafusion/core/tests/expr_api/simplification.rs index 572a7e2b335c..46c36c6abdac 100644 --- a/datafusion/core/tests/expr_api/simplification.rs +++ b/datafusion/core/tests/expr_api/simplification.rs @@ -514,7 +514,7 @@ fn multiple_now() -> Result<()> { // expect the same timestamp appears in both exprs let actual = get_optimized_plan_formatted(plan, &time); let expected = format!( - "Projection: TimestampNanosecond({}, Some(\"+00:00\")) AS now(), TimestampNanosecond({}, Some(\"+00:00\")) AS t2\n TableScan: test", + "Projection: TimestampNanosecond({}, None) AS now(), TimestampNanosecond({}, None) AS t2\n TableScan: test", time.timestamp_nanos_opt().unwrap(), time.timestamp_nanos_opt().unwrap() ); diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/context_generator.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/context_generator.rs index 2abfcd8417cb..fa8ea0b31c02 100644 --- a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/context_generator.rs +++ b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/context_generator.rs @@ -44,7 +44,6 @@ use crate::fuzz_cases::aggregation_fuzzer::data_generator::Dataset; /// - hint `sorted` or not /// - `spilling` or not (TODO, I think a special `MemoryPool` may be needed /// to support this) -/// pub struct SessionContextGenerator { /// Current testing dataset dataset: Arc, diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs index 753a74995d8f..aaf2d1b9bad4 100644 --- 
a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs +++ b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs @@ -39,7 +39,6 @@ use crate::fuzz_cases::record_batch_generator::{ColumnDescr, RecordBatchGenerato /// will generate one `base dataset` firstly. Then the `base dataset` will be sorted /// based on each `sort_key` respectively. And finally `len(sort_keys) + 1` datasets /// will be returned -/// #[derive(Debug, Clone)] pub struct DatasetGeneratorConfig { /// Descriptions of columns in datasets, it's `required` @@ -115,7 +114,6 @@ impl DatasetGeneratorConfig { /// /// - Split each batch to multiple batches which each sub-batch in has the randomly `rows num`, /// and this multiple batches will be used to create the `Dataset`. -/// pub struct DatasetGenerator { batch_generator: RecordBatchGenerator, sort_keys_set: Vec>, diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs index b90b3e5e32df..1a8ef278cc29 100644 --- a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs +++ b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs @@ -253,7 +253,6 @@ impl AggregationFuzzer { /// /// - `dataset_ref`, the input dataset, store it for error reported when found /// the inconsistency between the one for `ctx` and `expected results`. -/// struct AggregationFuzzTestTask { /// Generated session context in current test case ctx_with_params: SessionContextWithParams, diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/query_builder.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/query_builder.rs index 209278385b7b..766e2bedd74c 100644 --- a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/query_builder.rs +++ b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/query_builder.rs @@ -24,7 +24,7 @@ use rand::{rng, seq::SliceRandom, Rng}; /// Creates queries like /// ```sql /// SELECT AGG(..) FROM table_name GROUP BY -///``` +/// ``` #[derive(Debug, Default, Clone)] pub struct QueryBuilder { // =================================== @@ -95,7 +95,6 @@ pub struct QueryBuilder { /// More details can see [`GroupOrdering`]. 
/// /// [`GroupOrdering`]: datafusion_physical_plan::aggregates::order::GroupOrdering - /// dataset_sort_keys: Vec<Vec<String>>, /// If we will also test the no grouping case like: @@ -103,7 +102,6 @@ pub struct QueryBuilder { /// ```text /// SELECT aggr FROM t; /// ``` - /// no_grouping: bool, // ==================================== diff --git a/datafusion/core/tests/fuzz_cases/equivalence/projection.rs b/datafusion/core/tests/fuzz_cases/equivalence/projection.rs index 69639b3e09fd..a72a1558b2e4 100644 --- a/datafusion/core/tests/fuzz_cases/equivalence/projection.rs +++ b/datafusion/core/tests/fuzz_cases/equivalence/projection.rs @@ -58,7 +58,7 @@ fn project_orderings_random() -> Result<()> { Operator::Plus, col("b", &test_schema)?, )) as Arc<dyn PhysicalExpr>; - let proj_exprs = vec![ + let proj_exprs = [ (col("a", &test_schema)?, "a_new"), (col("b", &test_schema)?, "b_new"), (col("c", &test_schema)?, "c_new"), @@ -132,7 +132,7 @@ fn ordering_satisfy_after_projection_random() -> Result<()> { Operator::Plus, col("b", &test_schema)?, )) as Arc<dyn PhysicalExpr>; - let proj_exprs = vec![ + let proj_exprs = [ (col("a", &test_schema)?, "a_new"), (col("b", &test_schema)?, "b_new"), (col("c", &test_schema)?, "c_new"), diff --git a/datafusion/core/tests/fuzz_cases/pruning.rs b/datafusion/core/tests/fuzz_cases/pruning.rs index f8bd4dbc1a76..51ec8f03e5d2 100644 --- a/datafusion/core/tests/fuzz_cases/pruning.rs +++ b/datafusion/core/tests/fuzz_cases/pruning.rs @@ -276,13 +276,12 @@ async fn execute_with_predicate( ctx: &SessionContext, ) -> Vec<RecordBatch> { let parquet_source = if prune_stats { - ParquetSource::default().with_predicate(predicate.clone()) + ParquetSource::new(schema.clone()).with_predicate(predicate.clone()) } else { - ParquetSource::default() + ParquetSource::new(schema.clone()) }; let config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("memory://").unwrap(), - schema.clone(), Arc::new(parquet_source), ) .with_file_group( diff --git a/datafusion/core/tests/optimizer/mod.rs b/datafusion/core/tests/optimizer/mod.rs index aec32d05624c..9b2a5596827d 100644 --- a/datafusion/core/tests/optimizer/mod.rs +++ b/datafusion/core/tests/optimizer/mod.rs @@ -287,7 +287,7 @@ fn test_nested_schema_nullability() { #[test] fn test_inequalities_non_null_bounded() { - let guarantees = vec![ + let guarantees = [ // x ∈ [1, 3] (not null) ( col("x"), diff --git a/datafusion/core/tests/parquet/custom_reader.rs b/datafusion/core/tests/parquet/custom_reader.rs index 3a1f06656236..0a147d15a6fd 100644 --- a/datafusion/core/tests/parquet/custom_reader.rs +++ b/datafusion/core/tests/parquet/custom_reader.rs @@ -80,7 +80,7 @@ async fn route_data_access_ops_to_parquet_file_reader_factory() { .collect(); let source = Arc::new( - ParquetSource::default() + ParquetSource::new(file_schema.clone()) // prepare the scan .with_parquet_file_reader_factory(Arc::new( InMemoryParquetFileReaderFactory(Arc::clone(&in_memory_object_store)), @@ -89,7 +89,6 @@ async fn route_data_access_ops_to_parquet_file_reader_factory() { let base_config = FileScanConfigBuilder::new( // just any url that doesn't point to in memory object store ObjectStoreUrl::local_filesystem(), - file_schema, source, ) .with_file_group(file_group) diff --git a/datafusion/core/tests/parquet/encryption.rs b/datafusion/core/tests/parquet/encryption.rs index 819d8bf3a283..09b93f06ce85 100644 --- a/datafusion/core/tests/parquet/encryption.rs +++ b/datafusion/core/tests/parquet/encryption.rs @@ -314,7 +314,7 @@ async fn verify_file_encrypted( for col in row_group.columns() { assert!(matches!( col.crypto_metadata(), 
- Some(ColumnCryptoMetaData::EncryptionWithFooterKey) + Some(ColumnCryptoMetaData::ENCRYPTION_WITH_FOOTER_KEY) )); } } @@ -336,7 +336,7 @@ impl EncryptionFactory for MockEncryptionFactory { config: &EncryptionFactoryOptions, _schema: &SchemaRef, file_path: &object_store::path::Path, - ) -> datafusion_common::Result> { + ) -> datafusion_common::Result>> { assert_eq!( config.options.get("test_key"), Some(&"test value".to_string()) @@ -353,7 +353,7 @@ impl EncryptionFactory for MockEncryptionFactory { &self, config: &EncryptionFactoryOptions, file_path: &object_store::path::Path, - ) -> datafusion_common::Result> { + ) -> datafusion_common::Result>> { assert_eq!( config.options.get("test_key"), Some(&"test value".to_string()) diff --git a/datafusion/core/tests/parquet/external_access_plan.rs b/datafusion/core/tests/parquet/external_access_plan.rs index a5397c5a397c..b35cb6e09cfb 100644 --- a/datafusion/core/tests/parquet/external_access_plan.rs +++ b/datafusion/core/tests/parquet/external_access_plan.rs @@ -33,7 +33,7 @@ use datafusion_common::{assert_contains, DFSchema}; use datafusion_datasource_parquet::{ParquetAccessPlan, RowGroupAccess}; use datafusion_execution::object_store::ObjectStoreUrl; use datafusion_expr::{col, lit, Expr}; -use datafusion_physical_plan::metrics::MetricsSet; +use datafusion_physical_plan::metrics::{MetricValue, MetricsSet}; use datafusion_physical_plan::ExecutionPlan; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; @@ -178,12 +178,21 @@ async fn plan_and_filter() { .unwrap(); // Verify that row group pruning still happens for just that group - let row_groups_pruned_statistics = - metric_value(&parquet_metrics, "row_groups_pruned_statistics").unwrap(); - assert_eq!( - row_groups_pruned_statistics, 1, - "metrics : {parquet_metrics:#?}", - ); + let row_groups_pruned_statistics = parquet_metrics + .sum_by_name("row_groups_pruned_statistics") + .unwrap(); + if let MetricValue::PruningMetrics { + pruning_metrics, .. 
+ } = row_groups_pruned_statistics + { + assert_eq!( + pruning_metrics.pruned(), + 1, + "metrics : {parquet_metrics:#?}", + ); + } else { + unreachable!("metrics `row_groups_pruned_statistics` should exist") + } } #[tokio::test] @@ -346,11 +355,11 @@ impl TestFull { let source = if let Some(predicate) = predicate { let df_schema = DFSchema::try_from(schema.clone())?; let predicate = ctx.create_physical_expr(predicate, &df_schema)?; - Arc::new(ParquetSource::default().with_predicate(predicate)) + Arc::new(ParquetSource::new(schema.clone()).with_predicate(predicate)) } else { - Arc::new(ParquetSource::default()) + Arc::new(ParquetSource::new(schema.clone())) }; - let config = FileScanConfigBuilder::new(object_store_url, schema.clone(), source) + let config = FileScanConfigBuilder::new(object_store_url, source) .with_file(partitioned_file) .build(); diff --git a/datafusion/core/tests/parquet/filter_pushdown.rs b/datafusion/core/tests/parquet/filter_pushdown.rs index b769fec7d372..966f25161397 100644 --- a/datafusion/core/tests/parquet/filter_pushdown.rs +++ b/datafusion/core/tests/parquet/filter_pushdown.rs @@ -29,7 +29,7 @@ use arrow::compute::concat_batches; use arrow::record_batch::RecordBatch; use datafusion::physical_plan::collect; -use datafusion::physical_plan::metrics::MetricsSet; +use datafusion::physical_plan::metrics::{MetricValue, MetricsSet}; use datafusion::prelude::{ col, lit, lit_timestamp_nano, Expr, ParquetReadOptions, SessionContext, }; @@ -563,9 +563,9 @@ impl<'a> TestCase<'a> { } }; - let page_index_rows_pruned = get_value(&metrics, "page_index_rows_pruned"); + let (page_index_rows_pruned, page_index_rows_matched) = + get_pruning_metrics(&metrics, "page_index_rows_pruned"); println!(" page_index_rows_pruned: {page_index_rows_pruned}"); - let page_index_rows_matched = get_value(&metrics, "page_index_rows_matched"); println!(" page_index_rows_matched: {page_index_rows_matched}"); let page_index_filtering_expected = if scan_options.enable_page_index { @@ -592,14 +592,29 @@ impl<'a> TestCase<'a> { } } +fn get_pruning_metrics(metrics: &MetricsSet, metric_name: &str) -> (usize, usize) { + match metrics.sum_by_name(metric_name) { + Some(MetricValue::PruningMetrics { + pruning_metrics, .. + }) => (pruning_metrics.pruned(), pruning_metrics.matched()), + Some(_) => { + panic!("Metric '{metric_name}' is not a pruning metric in\n\n{metrics:#?}") + } + None => panic!( + "Expected metric not found. Looking for '{metric_name}' in\n\n{metrics:#?}" + ), + } +} + fn get_value(metrics: &MetricsSet, metric_name: &str) -> usize { match metrics.sum_by_name(metric_name) { + Some(MetricValue::PruningMetrics { + pruning_metrics, .. + }) => pruning_metrics.pruned(), Some(v) => v.as_usize(), - _ => { - panic!( - "Expected metric not found. Looking for '{metric_name}' in\n\n{metrics:#?}" - ); - } + None => panic!( + "Expected metric not found. Looking for '{metric_name}' in\n\n{metrics:#?}" + ), } } @@ -631,8 +646,8 @@ async fn predicate_cache_pushdown_default() -> datafusion_common::Result<()> { #[tokio::test] async fn predicate_cache_pushdown_disable() -> datafusion_common::Result<()> { - // Can disable the cache even with filter pushdown by setting the size to 0. In this case we - // expect the inner records are reported but no records are read from the cache + // Can disable the cache even with filter pushdown by setting the size to 0. 
+ // This results in no records read from the cache and no metrics reported let mut config = SessionConfig::new(); config.options_mut().execution.parquet.pushdown_filters = true; config @@ -641,13 +656,10 @@ async fn predicate_cache_pushdown_disable() -> datafusion_common::Result<()> { .parquet .max_predicate_cache_size = Some(0); let ctx = SessionContext::new_with_config(config); + // Since the cache is disabled, there is no reporting or use of the cache PredicateCacheTest { - // file has 8 rows, which need to be read twice, one for filter, one for - // final output - expected_inner_records: 16, - // Expect this to 0 records read as the cache is disabled. However, it is - // non zero due to https://github.com/apache/arrow-rs/issues/8307 - expected_records: 3, + expected_inner_records: 0, + expected_records: 0, } .run(&ctx) .await diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs index c44d14abd381..097600e45ead 100644 --- a/datafusion/core/tests/parquet/mod.rs +++ b/datafusion/core/tests/parquet/mod.rs @@ -37,6 +37,7 @@ use datafusion::{ prelude::{ParquetReadOptions, SessionConfig, SessionContext}, }; use datafusion_expr::{Expr, LogicalPlan, LogicalPlanBuilder}; +use datafusion_physical_plan::metrics::MetricValue; use parquet::arrow::ArrowWriter; use parquet::file::properties::{EnabledStatistics, WriterProperties}; use std::sync::Arc; @@ -125,9 +126,44 @@ struct TestOutput { impl TestOutput { /// retrieve the value of the named metric, if any fn metric_value(&self, metric_name: &str) -> Option { + if let Some((pruned, _matched)) = self.pruning_metric(metric_name) { + return Some(pruned); + } + self.parquet_metrics .sum(|metric| metric.value().name() == metric_name) - .map(|v| v.as_usize()) + .map(|v| match v { + MetricValue::PruningMetrics { + pruning_metrics, .. + } => pruning_metrics.pruned(), + _ => v.as_usize(), + }) + } + + fn pruning_metric(&self, metric_name: &str) -> Option<(usize, usize)> { + let mut total_pruned = 0; + let mut total_matched = 0; + let mut found = false; + + for metric in self.parquet_metrics.iter() { + let metric = metric.as_ref(); + if metric.value().name() == metric_name { + if let MetricValue::PruningMetrics { + pruning_metrics, .. 
+ } = metric.value() { + total_pruned += pruning_metrics.pruned(); + total_matched += pruning_metrics.matched(); + found = true; + } + } + } + + if found { + Some((total_pruned, total_matched)) + } else { + None + } } /// The number of times the pruning predicate evaluation errors @@ -135,47 +171,52 @@ impl TestOutput { self.metric_value("predicate_evaluation_errors") } - /// The number of row_groups matched by bloom filter - fn row_groups_matched_bloom_filter(&self) -> Option { - self.metric_value("row_groups_matched_bloom_filter") - } - - /// The number of row_groups pruned by bloom filter - fn row_groups_pruned_bloom_filter(&self) -> Option { - self.metric_value("row_groups_pruned_bloom_filter") + /// The number of row_groups pruned / matched by bloom filter + fn row_groups_bloom_filter(&self) -> Option<(usize, usize)> { + self.pruning_metric("row_groups_pruned_bloom_filter") } /// The number of row_groups matched by statistics fn row_groups_matched_statistics(&self) -> Option { - self.metric_value("row_groups_matched_statistics") + self.pruning_metric("row_groups_pruned_statistics") + .map(|(_pruned, matched)| matched) } /// The number of row_groups pruned by statistics fn row_groups_pruned_statistics(&self) -> Option { - self.metric_value("row_groups_pruned_statistics") + self.pruning_metric("row_groups_pruned_statistics") + .map(|(pruned, _matched)| pruned) } + /// Metric `files_ranges_pruned_statistics` tracks both pruned and matched counts; + /// for testing purposes, this helper only aggregates the `pruned` count. fn files_ranges_pruned_statistics(&self) -> Option { - self.metric_value("files_ranges_pruned_statistics") + self.pruning_metric("files_ranges_pruned_statistics") + .map(|(pruned, _matched)| pruned) } /// The number of row_groups matched by bloom filter or statistics + /// + /// E.g. starting with 10 row groups: statistics go from 10 total -> 7 matched, the + /// bloom filter goes from 7 total -> 3 matched, and this function returns 3 as the + /// final matched count.
fn row_groups_matched(&self) -> Option { - self.row_groups_matched_bloom_filter() - .zip(self.row_groups_matched_statistics()) - .map(|(a, b)| a + b) + self.row_groups_bloom_filter() + .map(|(_pruned, matched)| matched) } /// The number of row_groups pruned fn row_groups_pruned(&self) -> Option { - self.row_groups_pruned_bloom_filter() + self.row_groups_bloom_filter() + .map(|(pruned, _matched)| pruned) .zip(self.row_groups_pruned_statistics()) .map(|(a, b)| a + b) } /// The number of row pages pruned fn row_pages_pruned(&self) -> Option { - self.metric_value("page_index_rows_pruned") + self.pruning_metric("page_index_rows_pruned") + .map(|(pruned, _matched)| pruned) } fn description(&self) -> String { diff --git a/datafusion/core/tests/parquet/page_pruning.rs b/datafusion/core/tests/parquet/page_pruning.rs index 27bee10234b5..fb2a196b0aa6 100644 --- a/datafusion/core/tests/parquet/page_pruning.rs +++ b/datafusion/core/tests/parquet/page_pruning.rs @@ -81,12 +81,12 @@ async fn get_parquet_exec( let predicate = create_physical_expr(&filter, &df_schema, &execution_props).unwrap(); let source = Arc::new( - ParquetSource::default() + ParquetSource::new(schema.clone()) .with_predicate(predicate) .with_enable_page_index(true) .with_pushdown_filters(pushdown_filters), ); - let base_config = FileScanConfigBuilder::new(object_store_url, schema, source) + let base_config = FileScanConfigBuilder::new(object_store_url, source) .with_file(partitioned_file) .build(); diff --git a/datafusion/core/tests/parquet/row_group_pruning.rs b/datafusion/core/tests/parquet/row_group_pruning.rs index 44409166d3ce..0411298055f2 100644 --- a/datafusion/core/tests/parquet/row_group_pruning.rs +++ b/datafusion/core/tests/parquet/row_group_pruning.rs @@ -133,13 +133,14 @@ impl RowGroupPruningTest { self.expected_files_pruned_by_statistics, "mismatched files_ranges_pruned_statistics", ); + let bloom_filter_metrics = output.row_groups_bloom_filter(); assert_eq!( - output.row_groups_matched_bloom_filter(), + bloom_filter_metrics.map(|(_pruned, matched)| matched), self.expected_row_group_matched_by_bloom_filter, "mismatched row_groups_matched_bloom_filter", ); assert_eq!( - output.row_groups_pruned_bloom_filter(), + bloom_filter_metrics.map(|(pruned, _matched)| pruned), self.expected_row_group_pruned_by_bloom_filter, "mismatched row_groups_pruned_bloom_filter", ); @@ -163,7 +164,7 @@ async fn prune_timestamps_nanos() { .with_matched_by_stats(Some(3)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(3)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(10) .test_row_group_prune() @@ -181,7 +182,7 @@ async fn prune_timestamps_micros() { .with_matched_by_stats(Some(3)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(3)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(10) .test_row_group_prune() @@ -199,7 +200,7 @@ async fn prune_timestamps_millis() { .with_matched_by_stats(Some(3)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(3)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(10) .test_row_group_prune() @@ -217,7 +218,7 @@ async fn prune_timestamps_seconds() { .with_matched_by_stats(Some(3)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(3)) 
.with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(10) .test_row_group_prune() @@ -233,7 +234,7 @@ async fn prune_date32() { .with_matched_by_stats(Some(1)) .with_pruned_by_stats(Some(3)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(1)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(1) .test_row_group_prune() @@ -262,8 +263,9 @@ async fn prune_date64() { println!("{}", output.description()); // This should prune out groups without error assert_eq!(output.predicate_evaluation_errors(), Some(0)); - assert_eq!(output.row_groups_matched(), Some(1)); - assert_eq!(output.row_groups_pruned(), Some(3)); + // 'dates' table has 4 row groups, and only the first one is matched by the predicate + assert_eq!(output.row_groups_matched_statistics(), Some(1)); + assert_eq!(output.row_groups_pruned_statistics(), Some(3)); assert_eq!(output.result_rows, 1, "{}", output.description()); } @@ -276,7 +278,7 @@ async fn prune_disabled() { .with_matched_by_stats(Some(3)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(3)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(10) .test_row_group_prune() @@ -296,7 +298,7 @@ async fn prune_disabled() { // This should not prune any assert_eq!(output.predicate_evaluation_errors(), Some(0)); - assert_eq!(output.row_groups_matched(), Some(0)); + assert_eq!(output.row_groups_matched(), Some(4)); assert_eq!(output.row_groups_pruned(), Some(0)); assert_eq!( output.result_rows, @@ -322,7 +324,7 @@ macro_rules! int_tests { .with_matched_by_stats(Some(3)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(3)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(11) .test_row_group_prune() @@ -337,7 +339,7 @@ macro_rules! int_tests { .with_matched_by_stats(Some(3)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(3)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(11) .test_row_group_prune() @@ -381,10 +383,10 @@ macro_rules! int_tests { .with_scenario(Scenario::Int) .with_query(&format!("SELECT * FROM t where abs(i{}) = 1", $bits)) .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(0)) + .with_matched_by_stats(Some(4)) .with_pruned_by_stats(Some(0)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(4)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(3) .test_row_group_prune() @@ -397,10 +399,10 @@ macro_rules! int_tests { .with_scenario(Scenario::Int) .with_query(&format!("SELECT * FROM t where i{}+1 = 1", $bits)) .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(0)) + .with_matched_by_stats(Some(4)) .with_pruned_by_stats(Some(0)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(4)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(2) .test_row_group_prune() @@ -413,10 +415,10 @@ macro_rules! 
int_tests { .with_scenario(Scenario::Int) .with_query(&format!("SELECT * FROM t where 1-i{} > 1", $bits)) .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(0)) + .with_matched_by_stats(Some(4)) .with_pruned_by_stats(Some(0)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(4)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(9) .test_row_group_prune() @@ -498,7 +500,7 @@ macro_rules! uint_tests { .with_matched_by_stats(Some(3)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(3)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(11) .test_row_group_prune() @@ -542,10 +544,10 @@ macro_rules! uint_tests { .with_scenario(Scenario::UInt) .with_query(&format!("SELECT * FROM t where power(u{}, 2) = 25", $bits)) .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(0)) + .with_matched_by_stats(Some(4)) .with_pruned_by_stats(Some(0)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(4)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(2) .test_row_group_prune() @@ -558,10 +560,10 @@ macro_rules! uint_tests { .with_scenario(Scenario::UInt) .with_query(&format!("SELECT * FROM t where u{}+1 = 6", $bits)) .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(0)) + .with_matched_by_stats(Some(4)) .with_pruned_by_stats(Some(0)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(4)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(2) .test_row_group_prune() @@ -682,7 +684,7 @@ async fn prune_f64_lt() { .with_matched_by_stats(Some(3)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(3)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(11) .test_row_group_prune() @@ -694,7 +696,7 @@ async fn prune_f64_lt() { .with_matched_by_stats(Some(3)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(3)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(11) .test_row_group_prune() @@ -712,7 +714,7 @@ async fn prune_f64_scalar_fun_and_gt() { .with_matched_by_stats(Some(2)) .with_pruned_by_stats(Some(2)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(2)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(1) .test_row_group_prune() @@ -726,10 +728,10 @@ async fn prune_f64_scalar_fun() { .with_scenario(Scenario::Float64) .with_query("SELECT * FROM t where abs(f-1) <= 0.000001") .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(0)) + .with_matched_by_stats(Some(4)) .with_pruned_by_stats(Some(0)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(4)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(1) .test_row_group_prune() @@ -743,10 +745,10 @@ async fn prune_f64_complex_expr() { .with_scenario(Scenario::Float64) .with_query("SELECT * FROM t where f+1 > 1.1") .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(0)) + .with_matched_by_stats(Some(4)) .with_pruned_by_stats(Some(0)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(4)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(9) .test_row_group_prune() @@ 
-760,10 +762,10 @@ async fn prune_f64_complex_expr_subtract() { .with_scenario(Scenario::Float64) .with_query("SELECT * FROM t where 1-f > 1") .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(0)) + .with_matched_by_stats(Some(4)) .with_pruned_by_stats(Some(0)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(4)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(9) .test_row_group_prune() @@ -782,7 +784,7 @@ async fn prune_decimal_lt() { .with_matched_by_stats(Some(2)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(2)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(6) .test_row_group_prune() @@ -794,7 +796,7 @@ async fn prune_decimal_lt() { .with_matched_by_stats(Some(2)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(2)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(8) .test_row_group_prune() @@ -806,7 +808,7 @@ async fn prune_decimal_lt() { .with_matched_by_stats(Some(2)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(2)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(6) .test_row_group_prune() @@ -818,7 +820,7 @@ async fn prune_decimal_lt() { .with_matched_by_stats(Some(2)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(2)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(8) .test_row_group_prune() @@ -894,7 +896,7 @@ async fn prune_decimal_in_list() { .with_matched_by_stats(Some(2)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(2)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(5) .test_row_group_prune() @@ -906,7 +908,7 @@ async fn prune_decimal_in_list() { .with_matched_by_stats(Some(2)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(2)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(6) .test_row_group_prune() @@ -918,7 +920,7 @@ async fn prune_decimal_in_list() { .with_matched_by_stats(Some(2)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(2)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(5) .test_row_group_prune() @@ -930,7 +932,7 @@ async fn prune_decimal_in_list() { .with_matched_by_stats(Some(2)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(2)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(6) .test_row_group_prune() @@ -1064,7 +1066,7 @@ async fn prune_string_lt() { .with_matched_by_stats(Some(1)) .with_pruned_by_stats(Some(2)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(1)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(3) .test_row_group_prune() @@ -1079,7 +1081,7 @@ async fn prune_string_lt() { .with_matched_by_stats(Some(2)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(2)) .with_pruned_by_bloom_filter(Some(0)) // all backends from 'mixed' 
and 'all backends' .with_expected_rows(8) @@ -1172,7 +1174,7 @@ async fn prune_binary_lt() { .with_matched_by_stats(Some(1)) .with_pruned_by_stats(Some(2)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(1)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(3) .test_row_group_prune() @@ -1187,7 +1189,7 @@ async fn prune_binary_lt() { .with_matched_by_stats(Some(2)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(2)) .with_pruned_by_bloom_filter(Some(0)) // all backends from 'mixed' and 'all backends' .with_expected_rows(8) @@ -1279,7 +1281,7 @@ async fn prune_fixedsizebinary_lt() { .with_matched_by_stats(Some(1)) .with_pruned_by_stats(Some(2)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(1)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(2) .test_row_group_prune() @@ -1294,7 +1296,7 @@ async fn prune_fixedsizebinary_lt() { .with_matched_by_stats(Some(2)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(2)) .with_pruned_by_bloom_filter(Some(0)) // all backends from 'mixed' and 'all backends' .with_expected_rows(8) @@ -1362,7 +1364,7 @@ async fn test_row_group_with_null_values() { .with_pruned_files(Some(0)) .with_pruned_by_stats(Some(2)) .with_expected_rows(5) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(1)) .with_pruned_by_bloom_filter(Some(0)) .test_row_group_prune() .await; @@ -1376,7 +1378,7 @@ async fn test_row_group_with_null_values() { .with_pruned_files(Some(0)) .with_pruned_by_stats(Some(1)) .with_expected_rows(10) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(2)) .with_pruned_by_bloom_filter(Some(0)) .test_row_group_prune() .await; @@ -1390,7 +1392,7 @@ async fn test_row_group_with_null_values() { .with_pruned_files(Some(0)) .with_pruned_by_stats(Some(2)) .with_expected_rows(5) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(1)) .with_pruned_by_bloom_filter(Some(0)) .test_row_group_prune() .await; diff --git a/datafusion/core/tests/parquet/schema_adapter.rs b/datafusion/core/tests/parquet/schema_adapter.rs index 40fc6176e212..0e76d626aac5 100644 --- a/datafusion/core/tests/parquet/schema_adapter.rs +++ b/datafusion/core/tests/parquet/schema_adapter.rs @@ -482,7 +482,7 @@ fn test_apply_schema_adapter_with_factory() { ])); // Create a parquet source - let source = ParquetSource::default(); + let source = ParquetSource::new(schema.clone()); // Create a file scan config with source that has a schema adapter factory let factory = Arc::new(PrefixAdapterFactory { @@ -491,12 +491,9 @@ fn test_apply_schema_adapter_with_factory() { let file_source = source.clone().with_schema_adapter_factory(factory).unwrap(); - let config = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - schema.clone(), - file_source, - ) - .build(); + let config = + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_source) + .build(); // Apply schema adapter to a new source let result_source = source.apply_schema_adapter(&config).unwrap(); @@ -532,18 +529,15 @@ fn test_apply_schema_adapter_without_factory() { ])); // Create a parquet source - let source = ParquetSource::default(); + let source = ParquetSource::new(schema.clone()); // Convert to Arc let file_source: Arc = 
Arc::new(source.clone()); // Create a file scan config without a schema adapter factory - let config = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - schema.clone(), - file_source, - ) - .build(); + let config = + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_source) + .build(); // Apply schema adapter function - should pass through the source unchanged let result_source = source.apply_schema_adapter(&config).unwrap(); diff --git a/datafusion/core/tests/parquet/schema_coercion.rs b/datafusion/core/tests/parquet/schema_coercion.rs index 59cbf4b0872e..51e5242cbafd 100644 --- a/datafusion/core/tests/parquet/schema_coercion.rs +++ b/datafusion/core/tests/parquet/schema_coercion.rs @@ -62,14 +62,10 @@ async fn multi_parquet_coercion() { Field::new("c2", DataType::Int32, true), Field::new("c3", DataType::Float64, true), ])); - let source = Arc::new(ParquetSource::default()); - let conf = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - file_schema, - source, - ) - .with_file_group(file_group) - .build(); + let source = Arc::new(ParquetSource::new(file_schema.clone())); + let conf = FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), source) + .with_file_group(file_group) + .build(); let parquet_exec = DataSourceExec::from_data_source(conf); @@ -122,11 +118,10 @@ async fn multi_parquet_coercion_projection() { ])); let config = FileScanConfigBuilder::new( ObjectStoreUrl::local_filesystem(), - file_schema, - Arc::new(ParquetSource::default()), + Arc::new(ParquetSource::new(file_schema)), ) .with_file_group(file_group) - .with_projection(Some(vec![1, 0, 2])) + .with_projection_indices(Some(vec![1, 0, 2])) .build(); let parquet_exec = DataSourceExec::from_data_source(config); diff --git a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs index 63111f43806b..aa5a2d053926 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs @@ -37,6 +37,7 @@ use datafusion::datasource::physical_plan::{CsvSource, ParquetSource}; use datafusion::datasource::source::DataSourceExec; use datafusion::datasource::MemTable; use datafusion::prelude::{SessionConfig, SessionContext}; +use datafusion_common::config::CsvOptions; use datafusion_common::error::Result; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_common::ScalarValue; @@ -66,9 +67,52 @@ use datafusion_physical_plan::projection::{ProjectionExec, ProjectionExpr}; use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; use datafusion_physical_plan::union::UnionExec; use datafusion_physical_plan::{ - get_plan_string, DisplayAs, DisplayFormatType, ExecutionPlanProperties, - PlanProperties, Statistics, + displayable, DisplayAs, DisplayFormatType, ExecutionPlanProperties, PlanProperties, + Statistics, }; +use insta::Settings; + +/// Helper function to replace only the first occurrence of a regex pattern in a plan +/// Returns (captured_group_1, modified_string) +fn hide_first( + plan: &dyn ExecutionPlan, + regex: &str, + replacement: &str, +) -> (String, String) { + let plan_str = displayable(plan).indent(true).to_string(); + let pattern = regex::Regex::new(regex).unwrap(); + + if let Some(captures) = pattern.captures(&plan_str) { + let full_match = captures.get(0).unwrap(); + let captured_value = captures + .get(1) + .map(|m| m.as_str().to_string()) + 
.unwrap_or_default(); + let pos = full_match.start(); + let end_pos = full_match.end(); + let mut result = String::with_capacity(plan_str.len()); + result.push_str(&plan_str[..pos]); + result.push_str(replacement); + result.push_str(&plan_str[end_pos..]); + (captured_value, result) + } else { + (String::new(), plan_str) + } +} + +macro_rules! assert_plan { + ($plan: expr, @ $expected:literal) => { + insta::assert_snapshot!( + displayable($plan.as_ref()).indent(true).to_string(), + @ $expected + ) + }; + ($plan: expr, $another_plan: expr) => { + let plan1 = displayable($plan.as_ref()).indent(true).to_string(); + let plan2 = displayable($another_plan.as_ref()).indent(true).to_string(); + assert_eq!(plan1, plan2); + } +} /// Models operators like BoundedWindowExec that require an input /// ordering but is easy to construct @@ -186,8 +230,7 @@ fn parquet_exec_multiple_sorted( ) -> Arc { let config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), - schema(), - Arc::new(ParquetSource::default()), + Arc::new(ParquetSource::new(schema())), ) .with_file_groups(vec![ FileGroup::new(vec![PartitionedFile::new("x".to_string(), 100)]), @@ -204,14 +247,19 @@ fn csv_exec() -> Arc { } fn csv_exec_with_sort(output_ordering: Vec) -> Arc { - let config = FileScanConfigBuilder::new( - ObjectStoreUrl::parse("test:///").unwrap(), - schema(), - Arc::new(CsvSource::new(false, b',', b'"')), - ) - .with_file(PartitionedFile::new("x".to_string(), 100)) - .with_output_ordering(output_ordering) - .build(); + let config = + FileScanConfigBuilder::new(ObjectStoreUrl::parse("test:///").unwrap(), { + let options = CsvOptions { + has_header: Some(false), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + Arc::new(CsvSource::new(schema()).with_csv_options(options)) + }) + .with_file(PartitionedFile::new("x".to_string(), 100)) + .with_output_ordering(output_ordering) + .build(); DataSourceExec::from_data_source(config) } @@ -222,17 +270,22 @@ fn csv_exec_multiple() -> Arc { // Created a sorted parquet exec with multiple files fn csv_exec_multiple_sorted(output_ordering: Vec) -> Arc { - let config = FileScanConfigBuilder::new( - ObjectStoreUrl::parse("test:///").unwrap(), - schema(), - Arc::new(CsvSource::new(false, b',', b'"')), - ) - .with_file_groups(vec![ - FileGroup::new(vec![PartitionedFile::new("x".to_string(), 100)]), - FileGroup::new(vec![PartitionedFile::new("y".to_string(), 100)]), - ]) - .with_output_ordering(output_ordering) - .build(); + let config = + FileScanConfigBuilder::new(ObjectStoreUrl::parse("test:///").unwrap(), { + let options = CsvOptions { + has_header: Some(false), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + Arc::new(CsvSource::new(schema()).with_csv_options(options)) + }) + .with_file_groups(vec![ + FileGroup::new(vec![PartitionedFile::new("x".to_string(), 100)]), + FileGroup::new(vec![PartitionedFile::new("y".to_string(), 100)]), + ]) + .with_output_ordering(output_ordering) + .build(); DataSourceExec::from_data_source(config) } @@ -352,22 +405,6 @@ fn ensure_distribution_helper( ensure_distribution(distribution_context, &config).map(|item| item.data.plan) } -/// Test whether plan matches with expected plan -macro_rules! 
plans_matches_expected { - ($EXPECTED_LINES: expr, $PLAN: expr) => { - let physical_plan = $PLAN; - let actual = get_plan_string(&physical_plan); - - let expected_plan_lines: Vec<&str> = $EXPECTED_LINES - .iter().map(|s| *s).collect(); - - assert_eq!( - expected_plan_lines, actual, - "\n**Original Plan Mismatch\n\nexpected:\n\n{expected_plan_lines:#?}\nactual:\n\n{actual:#?}\n\n" - ); - } -} - fn test_suite_default_config_options() -> ConfigOptions { let mut config = ConfigOptions::new(); @@ -445,14 +482,11 @@ impl TestConfig { /// Perform a series of runs using the current [`TestConfig`], /// assert the expected plan result, /// and return the result plan (for potential subsequent runs). - fn run( + fn try_to_plan( &self, - expected_lines: &[&str], plan: Arc, optimizers_to_run: &[Run], ) -> Result> { - let expected_lines: Vec<&str> = expected_lines.to_vec(); - // Add the ancillary output requirements operator at the start: let optimizer = OutputRequirements::new_add_mode(); let mut optimized = optimizer.optimize(plan.clone(), &self.config)?; @@ -507,30 +541,16 @@ impl TestConfig { let optimizer = OutputRequirements::new_remove_mode(); let optimized = optimizer.optimize(optimized, &self.config)?; - // Now format correctly - let actual_lines = get_plan_string(&optimized); - - assert_eq!( - &expected_lines, &actual_lines, - "\n\nexpected:\n\n{expected_lines:#?}\nactual:\n\n{actual_lines:#?}\n\n" - ); - Ok(optimized) } -} -macro_rules! assert_plan_txt { - ($EXPECTED_LINES: expr, $PLAN: expr) => { - let expected_lines: Vec<&str> = $EXPECTED_LINES.iter().map(|s| *s).collect(); - // Now format correctly - let actual_lines = get_plan_string(&$PLAN); - - assert_eq!( - &expected_lines, &actual_lines, - "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", - expected_lines, actual_lines - ); - }; + fn to_plan( + &self, + plan: Arc, + optimizers_to_run: &[Run], + ) -> Arc { + self.try_to_plan(plan, optimizers_to_run).unwrap() + } } #[test] @@ -556,6 +576,8 @@ fn multi_hash_joins() -> Result<()> { JoinType::RightAnti, ]; + let settings = Settings::clone_current(); + // Join on (a == b1) let join_on = vec![( Arc::new(Column::new_with_schema("a", &schema()).unwrap()) as _, @@ -564,11 +586,17 @@ fn multi_hash_joins() -> Result<()> { for join_type in join_types { let join = hash_join_exec(left.clone(), right.clone(), &join_on, &join_type); - let join_plan = |shift| -> String { - format!("{}HashJoinExec: mode=Partitioned, join_type={join_type}, on=[(a@0, b1@1)]", " ".repeat(shift)) - }; - let join_plan_indent2 = join_plan(2); - let join_plan_indent4 = join_plan(4); + + let mut settings = settings.clone(); + settings.add_filter( + // join_type={} replace with join_type=... to avoid snapshot name issue + format!("join_type={join_type}").as_str(), + "join_type=...", + ); + + insta::allow_duplicates! 
{ + settings.bind( || { + match join_type { JoinType::Inner @@ -589,50 +617,52 @@ fn multi_hash_joins() -> Result<()> { &top_join_on, &join_type, ); - let top_join_plan = - format!("HashJoinExec: mode=Partitioned, join_type={join_type}, on=[(a@0, c@2)]"); - let expected = match join_type { + let test_config = TestConfig::default(); + let plan_distrib = test_config.to_plan(top_join.clone(), &DISTRIB_DISTRIB_SORT); + + match join_type { // Should include 3 RepartitionExecs - JoinType::Inner | JoinType::Left | JoinType::LeftSemi | JoinType::LeftAnti | JoinType::LeftMark => vec![ - top_join_plan.as_str(), - &join_plan_indent2, - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], + JoinType::Inner | JoinType::Left | JoinType::LeftSemi | JoinType::LeftAnti | JoinType::LeftMark => { + + assert_plan!(plan_distrib, @r" + HashJoinExec: mode=Partitioned, join_type=..., on=[(a@0, c@2)] + HashJoinExec: mode=Partitioned, join_type=..., on=[(a@0, b1@1)] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); + }, // Should include 4 RepartitionExecs - _ => vec![ - top_join_plan.as_str(), - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - &join_plan_indent4, - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], + _ => { + assert_plan!(plan_distrib, @r" + HashJoinExec: mode=Partitioned, join_type=..., on=[(a@0, c@2)] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + HashJoinExec: mode=Partitioned, join_type=..., on=[(a@0, b1@1)] + 
RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); + }, }; - let test_config = TestConfig::default(); - test_config.run(&expected, top_join.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(&expected, top_join, &SORT_DISTRIB_DISTRIB)?; + + let plan_sort = test_config.to_plan(top_join, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); } JoinType::RightSemi | JoinType::RightAnti | JoinType::RightMark => {} } + + match join_type { JoinType::Inner | JoinType::Left @@ -650,55 +680,58 @@ fn multi_hash_joins() -> Result<()> { let top_join = hash_join_exec(join, parquet_exec(), &top_join_on, &join_type); - let top_join_plan = match join_type { - JoinType::RightSemi | JoinType::RightAnti => - format!("HashJoinExec: mode=Partitioned, join_type={join_type}, on=[(b1@1, c@2)]"), - _ => - format!("HashJoinExec: mode=Partitioned, join_type={join_type}, on=[(b1@6, c@2)]"), - }; - let expected = match join_type { + let test_config = TestConfig::default(); + let plan_distrib = test_config.to_plan(top_join.clone(), &DISTRIB_DISTRIB_SORT); + + match join_type { // Should include 3 RepartitionExecs - JoinType::Inner | JoinType::Right | JoinType::RightSemi | JoinType::RightAnti => - vec![ - top_join_plan.as_str(), - &join_plan_indent2, - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], + JoinType::Inner | JoinType::Right => { + assert_plan!(parquet_exec(), @"DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet"); + }, + // Should include 3 RepartitionExecs but have a different "on" + JoinType::RightSemi | JoinType::RightAnti => { + assert_plan!(plan_distrib, @r" + HashJoinExec: mode=Partitioned, join_type=..., on=[(b1@1, c@2)] + HashJoinExec: mode=Partitioned, join_type=..., on=[(a@0, b1@1)] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1 + DataSourceExec: 
file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); + + } + // Should include 4 RepartitionExecs - _ => - vec![ - top_join_plan.as_str(), - " RepartitionExec: partitioning=Hash([b1@6], 10), input_partitions=10", - &join_plan_indent4, - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], + _ => { + assert_plan!(plan_distrib, @r" + HashJoinExec: mode=Partitioned, join_type=..., on=[(b1@6, c@2)] + RepartitionExec: partitioning=Hash([b1@6], 10), input_partitions=10 + HashJoinExec: mode=Partitioned, join_type=..., on=[(a@0, b1@1)] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); + + }, }; - let test_config = TestConfig::default(); - test_config.run(&expected, top_join.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(&expected, top_join, &SORT_DISTRIB_DISTRIB)?; + + let plan_sort = test_config.to_plan(top_join, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); } JoinType::LeftSemi | JoinType::LeftAnti | JoinType::LeftMark => {} } + + }); + } } Ok(()) @@ -737,23 +770,24 @@ fn multi_joins_after_alias() -> Result<()> { ); // Output partition need to respect the Alias and should not introduce additional RepartitionExec - let expected = &[ - "HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a1@0, c@2)]", - " ProjectionExec: expr=[a@0 as a1, a@0 as a2]", - " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@1)]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, top_join.clone(), &DISTRIB_DISTRIB_SORT)?; - 
test_config.run(expected, top_join, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(top_join.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!( + plan_distrib, + @r" + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a1@0, c@2)] + ProjectionExec: expr=[a@0 as a1, a@0 as a2] + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@1)] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b@1], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + " + ); + let plan_sort = test_config.to_plan(top_join, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); // Join on (a2 == c) let top_join_on = vec![( @@ -764,23 +798,24 @@ fn multi_joins_after_alias() -> Result<()> { let top_join = hash_join_exec(projection, right, &top_join_on, &JoinType::Inner); // Output partition need to respect the Alias and should not introduce additional RepartitionExec - let expected = &[ - "HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a2@1, c@2)]", - " ProjectionExec: expr=[a@0 as a1, a@0 as a2]", - " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@1)]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, top_join.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, top_join, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(top_join.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!( + plan_distrib, + @r" + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a2@1, c@2)] + ProjectionExec: expr=[a@0 as a1, a@0 as a2] + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@1)] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b@1], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + " + ); + let plan_sort = test_config.to_plan(top_join, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -816,26 +851,26 @@ fn multi_joins_after_multi_alias() -> Result<()> { // The Column 'a' has different meaning now after the two Projections // The original Output partition can not satisfy the Join requirements and need to add an 
additional RepartitionExec - let expected = &[ - "HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, c@2)]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " ProjectionExec: expr=[c1@0 as a]", - " ProjectionExec: expr=[c@2 as c1]", - " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@1)]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, top_join.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, top_join, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(top_join.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!( + plan_distrib, + @r" + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, c@2)] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + ProjectionExec: expr=[c1@0 as a] + ProjectionExec: expr=[c@2 as c1] + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@1)] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b@1], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + " + ); + let plan_sort = test_config.to_plan(top_join, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -861,22 +896,26 @@ fn join_after_agg_alias() -> Result<()> { let join = hash_join_exec(left, right.clone(), &join_on, &JoinType::Inner); // Only two RepartitionExecs added - let expected = &[ - "HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a1@0, a2@0)]", - " AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[]", - " RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " AggregateExec: mode=FinalPartitioned, gby=[a2@0 as a2], aggr=[]", - " RepartitionExec: partitioning=Hash([a2@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a2], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, join.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, join, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(join.clone(), &DISTRIB_DISTRIB_SORT); + 
assert_plan!( + plan_distrib, + @r" + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a1@0, a2@0)] + AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[] + RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + AggregateExec: mode=FinalPartitioned, gby=[a2@0 as a2], aggr=[] + RepartitionExec: partitioning=Hash([a2@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a2], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + " + ); + let plan_sort = test_config.to_plan(join, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -914,23 +953,27 @@ fn hash_join_key_ordering() -> Result<()> { let join = hash_join_exec(left, right.clone(), &join_on, &JoinType::Inner); // Only two RepartitionExecs added - let expected = &[ - "HashJoinExec: mode=Partitioned, join_type=Inner, on=[(b1@1, b@0), (a1@0, a@1)]", - " ProjectionExec: expr=[a1@1 as a1, b1@0 as b1]", - " AggregateExec: mode=FinalPartitioned, gby=[b1@0 as b1, a1@1 as a1], aggr=[]", - " RepartitionExec: partitioning=Hash([b1@0, a1@1], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[b@1 as b1, a@0 as a1], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, a@1 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([b@0, a@1], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[b@1 as b, a@0 as a], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, join.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, join, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(join.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!( + plan_distrib, + @r" + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(b1@1, b@0), (a1@0, a@1)] + ProjectionExec: expr=[a1@1 as a1, b1@0 as b1] + AggregateExec: mode=FinalPartitioned, gby=[b1@0 as b1, a1@1 as a1], aggr=[] + RepartitionExec: partitioning=Hash([b1@0, a1@1], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[b@1 as b1, a@0 as a1], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, a@1 as a], aggr=[] + RepartitionExec: partitioning=Hash([b@0, a@1], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[b@1 as b, a@0 as a], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + " + ); + let plan_sort = test_config.to_plan(join, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -1034,30 +1077,31 @@ fn multi_hash_join_key_ordering() -> Result<()> { Arc::new(FilterExec::try_new(predicate, top_join)?); // The bottom joins' join key 
ordering is adjusted based on the top join. And the top join should not introduce additional RepartitionExec - let expected = &[ - "FilterExec: c@6 > 1", - " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(B@2, b1@6), (C@3, c@2), (AA@1, a1@5)]", - " ProjectionExec: expr=[a@0 as A, a@0 as AA, b@1 as B, c@2 as C]", - " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(b@1, b1@1), (c@2, c1@2), (a@0, a1@0)]", - " RepartitionExec: partitioning=Hash([b@1, c@2, a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b1@1, c1@2, a1@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(b@1, b1@1), (c@2, c1@2), (a@0, a1@0)]", - " RepartitionExec: partitioning=Hash([b@1, c@2, a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b1@1, c1@2, a1@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, filter_top_join.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, filter_top_join, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = + test_config.to_plan(filter_top_join.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!( + plan_distrib, + @r" + FilterExec: c@6 > 1 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(B@2, b1@6), (C@3, c@2), (AA@1, a1@5)] + ProjectionExec: expr=[a@0 as A, a@0 as AA, b@1 as B, c@2 as C] + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(b@1, b1@1), (c@2, c1@2), (a@0, a1@0)] + RepartitionExec: partitioning=Hash([b@1, c@2, a@0], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b1@1, c1@2, a1@0], 10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(b@1, b1@1), (c@2, c1@2), (a@0, a1@0)] + RepartitionExec: partitioning=Hash([b@1, c@2, a@0], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b1@1, c1@2, a1@0], 10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + " + ); + let plan_sort = test_config.to_plan(filter_top_join, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -1168,34 +1212,30 @@ fn reorder_join_keys_to_left_input() -> Result<()> { &top_join_on, &join_type, ); - let top_join_plan = - format!("HashJoinExec: mode=Partitioned, join_type={:?}, on=[(AA@1, a1@5), (B@2, b1@6), (C@3, c@2)]", 
&join_type); - let reordered = reorder_join_keys_to_inputs(top_join)?; + let reordered = reorder_join_keys_to_inputs(top_join).unwrap(); // The top joins' join key ordering is adjusted based on the children inputs. - let expected = &[ - top_join_plan.as_str(), - " ProjectionExec: expr=[a@0 as A, a@0 as AA, b@1 as B, c@2 as C]", - " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a1@0), (b@1, b1@1), (c@2, c1@2)]", - " RepartitionExec: partitioning=Hash([a@0, b@1, c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([a1@0, b1@1, c1@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@2, c1@2), (b@1, b1@1), (a@0, a1@0)]", - " RepartitionExec: partitioning=Hash([c@2, b@1, a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c1@2, b1@1, a1@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - - assert_plan_txt!(expected, reordered); + let (captured_join_type, modified_plan) = + hide_first(reordered.as_ref(), r"join_type=(\w+)", "join_type=..."); + assert_eq!(captured_join_type, join_type.to_string()); + + insta::allow_duplicates! 
{insta::assert_snapshot!(modified_plan, @r" +HashJoinExec: mode=Partitioned, join_type=..., on=[(AA@1, a1@5), (B@2, b1@6), (C@3, c@2)] + ProjectionExec: expr=[a@0 as A, a@0 as AA, b@1 as B, c@2 as C] + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a1@0), (b@1, b1@1), (c@2, c1@2)] + RepartitionExec: partitioning=Hash([a@0, b@1, c@2], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([a1@0, b1@1, c1@2], 10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@2, c1@2), (b@1, b1@1), (a@0, a1@0)] + RepartitionExec: partitioning=Hash([c@2, b@1, a@0], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c1@2, b1@1, a1@0], 10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +");} } Ok(()) @@ -1302,34 +1342,28 @@ fn reorder_join_keys_to_right_input() -> Result<()> { &top_join_on, &join_type, ); - let top_join_plan = - format!("HashJoinExec: mode=Partitioned, join_type={:?}, on=[(C@3, c@2), (B@2, b1@6), (AA@1, a1@5)]", &join_type); - let reordered = reorder_join_keys_to_inputs(top_join)?; + let reordered = reorder_join_keys_to_inputs(top_join).unwrap(); // The top joins' join key ordering is adjusted based on the children inputs. - let expected = &[ - top_join_plan.as_str(), - " ProjectionExec: expr=[a@0 as A, a@0 as AA, b@1 as B, c@2 as C]", - " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a1@0), (b@1, b1@1)]", - " RepartitionExec: partitioning=Hash([a@0, b@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([a1@0, b1@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@2, c1@2), (b@1, b1@1), (a@0, a1@0)]", - " RepartitionExec: partitioning=Hash([c@2, b@1, a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c1@2, b1@1, a1@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - - assert_plan_txt!(expected, reordered); + let (_, plan_str) = + hide_first(reordered.as_ref(), r"join_type=(\w+)", "join_type=..."); + insta::allow_duplicates! 
{insta::assert_snapshot!(plan_str, @r" +HashJoinExec: mode=Partitioned, join_type=..., on=[(C@3, c@2), (B@2, b1@6), (AA@1, a1@5)] + ProjectionExec: expr=[a@0 as A, a@0 as AA, b@1 as B, c@2 as C] + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a1@0), (b@1, b1@1)] + RepartitionExec: partitioning=Hash([a@0, b@1], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([a1@0, b1@1], 10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@2, c1@2), (b@1, b1@1), (a@0, a1@0)] + RepartitionExec: partitioning=Hash([c@2, b@1, a@0], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c1@2, b1@1, a1@0], 10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +");} } Ok(()) @@ -1369,15 +1403,6 @@ fn multi_smj_joins() -> Result<()> { for join_type in join_types { let join = sort_merge_join_exec(left.clone(), right.clone(), &join_on, &join_type); - let join_plan = |shift| -> String { - format!( - "{}SortMergeJoin: join_type={join_type}, on=[(a@0, b1@1)]", - " ".repeat(shift) - ) - }; - let join_plan_indent2 = join_plan(2); - let join_plan_indent6 = join_plan(6); - let join_plan_indent10 = join_plan(10); // Top join on (a == c) let top_join_on = vec![( @@ -1386,235 +1411,220 @@ fn multi_smj_joins() -> Result<()> { )]; let top_join = sort_merge_join_exec(join.clone(), parquet_exec(), &top_join_on, &join_type); - let top_join_plan = - format!("SortMergeJoin: join_type={join_type}, on=[(a@0, c@2)]"); - - let expected = match join_type { - // Should include 6 RepartitionExecs (3 hash, 3 round-robin), 3 SortExecs - JoinType::Inner | JoinType::Left | JoinType::LeftSemi | JoinType::LeftAnti => - vec![ - top_join_plan.as_str(), - &join_plan_indent2, - " SortExec: expr=[a@0 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " SortExec: expr=[b1@1 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], - // Should include 7 RepartitionExecs (4 hash, 3 round-robin), 4 SortExecs - // Since ordering of the left child is not preserved after SortMergeJoin - // when mode is Right, RightSemi, RightAnti, Full - // - We need to add one additional SortExec after SortMergeJoin in contrast the test cases - // when mode is Inner, Left, LeftSemi, LeftAnti - // Similarly, since 
partitioning of the left side is not preserved - // when mode is Right, RightSemi, RightAnti, Full - // - We need to add one additional Hash Repartition after SortMergeJoin in contrast the test - // cases when mode is Inner, Left, LeftSemi, LeftAnti - _ => vec![ - top_join_plan.as_str(), - // Below 2 operators are differences introduced, when join mode is changed - " SortExec: expr=[a@0 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - &join_plan_indent6, - " SortExec: expr=[a@0 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " SortExec: expr=[b1@1 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], - }; - // TODO(wiedld): show different test result if enforce sorting first. - test_config.run(&expected, top_join.clone(), &DISTRIB_DISTRIB_SORT)?; - - let expected_first_sort_enforcement = match join_type { - // Should include 6 RepartitionExecs (3 hash, 3 round-robin), 3 SortExecs - JoinType::Inner | JoinType::Left | JoinType::LeftSemi | JoinType::LeftAnti => - vec![ - top_join_plan.as_str(), - &join_plan_indent2, - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[a@0 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10, preserve_order=true, sort_exprs=b1@1 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[b1@1 ASC], preserve_partitioning=[false]", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, preserve_order=true, sort_exprs=c@2 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], - // Should include 8 RepartitionExecs (4 hash, 8 round-robin), 4 SortExecs - // Since ordering of the left child is not preserved after SortMergeJoin - // when mode is Right, RightSemi, RightAnti, Full - // - We need to add one additional SortExec after SortMergeJoin in contrast the test cases - // when mode is Inner, Left, LeftSemi, LeftAnti - // Similarly, since partitioning of the left side is not preserved - // when mode is Right, RightSemi, 
RightAnti, Full - // - We need to add one additional Hash Repartition and Roundrobin repartition after - // SortMergeJoin in contrast the test cases when mode is Inner, Left, LeftSemi, LeftAnti - _ => vec![ - top_join_plan.as_str(), - // Below 4 operators are differences introduced, when join mode is changed - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[a@0 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - &join_plan_indent10, - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[a@0 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10, preserve_order=true, sort_exprs=b1@1 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[b1@1 ASC], preserve_partitioning=[false]", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, preserve_order=true, sort_exprs=c@2 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], - }; - // TODO(wiedld): show different test result if enforce distribution first. 
- test_config.run( - &expected_first_sort_enforcement, - top_join, - &SORT_DISTRIB_DISTRIB, - )?; - match join_type { - JoinType::Inner | JoinType::Left | JoinType::Right | JoinType::Full => { - // This time we use (b1 == c) for top join - // Join on (b1 == c) - let top_join_on = vec![( - Arc::new(Column::new_with_schema("b1", &join.schema()).unwrap()) as _, - Arc::new(Column::new_with_schema("c", &schema()).unwrap()) as _, - )]; - let top_join = - sort_merge_join_exec(join, parquet_exec(), &top_join_on, &join_type); - let top_join_plan = - format!("SortMergeJoin: join_type={join_type}, on=[(b1@6, c@2)]"); - - let expected = match join_type { - // Should include 6 RepartitionExecs(3 hash, 3 round-robin) and 3 SortExecs - JoinType::Inner | JoinType::Right => vec![ - top_join_plan.as_str(), - &join_plan_indent2, - " SortExec: expr=[a@0 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " SortExec: expr=[b1@1 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], - // Should include 7 RepartitionExecs (4 hash, 3 round-robin) and 4 SortExecs - JoinType::Left | JoinType::Full => vec![ - top_join_plan.as_str(), - " SortExec: expr=[b1@6 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([b1@6], 10), input_partitions=10", - &join_plan_indent6, - " SortExec: expr=[a@0 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " SortExec: expr=[b1@1 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], - // this match arm cannot be reached - _ => unreachable!() - }; - // TODO(wiedld): show different test result if enforce sorting first. 
- test_config.run(&expected, top_join.clone(), &DISTRIB_DISTRIB_SORT)?; - - let expected_first_sort_enforcement = match join_type { - // Should include 6 RepartitionExecs (3 of them preserves order) and 3 SortExecs - JoinType::Inner | JoinType::Right => vec![ - top_join_plan.as_str(), - &join_plan_indent2, - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[a@0 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10, preserve_order=true, sort_exprs=b1@1 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[b1@1 ASC], preserve_partitioning=[false]", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, preserve_order=true, sort_exprs=c@2 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], - // Should include 8 RepartitionExecs (4 of them preserves order) and 4 SortExecs - JoinType::Left | JoinType::Full => vec![ - top_join_plan.as_str(), - " RepartitionExec: partitioning=Hash([b1@6], 10), input_partitions=10, preserve_order=true, sort_exprs=b1@6 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[b1@6 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - &join_plan_indent10, - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[a@0 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10, preserve_order=true, sort_exprs=b1@1 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[b1@1 ASC], preserve_partitioning=[false]", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, preserve_order=true, sort_exprs=c@2 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], - // this match arm cannot be reached - _ => unreachable!() - }; + let mut settings = Settings::clone_current(); + settings.add_filter(&format!("join_type={join_type}"), "join_type=..."); + + #[rustfmt::skip] + insta::allow_duplicates! 
{ + settings.bind(|| { + let plan_distrib = test_config.to_plan(top_join.clone(), &DISTRIB_DISTRIB_SORT); + + match join_type { + // Should include 6 RepartitionExecs (3 hash, 3 round-robin), 3 SortExecs + JoinType::Inner | JoinType::Left | JoinType::LeftSemi | JoinType::LeftAnti => { + assert_plan!(plan_distrib, @r" +SortMergeJoin: join_type=..., on=[(a@0, c@2)] + SortMergeJoin: join_type=..., on=[(a@0, b1@1)] + SortExec: expr=[a@0 ASC], preserve_partitioning=[true] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + SortExec: expr=[b1@1 ASC], preserve_partitioning=[true] + RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + SortExec: expr=[c@2 ASC], preserve_partitioning=[true] + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + } + // Should include 7 RepartitionExecs (4 hash, 3 round-robin), 4 SortExecs + // Since ordering of the left child is not preserved after SortMergeJoin + // when mode is Right, RightSemi, RightAnti, Full + // - We need to add one additional SortExec after SortMergeJoin in contrast the test cases + // when mode is Inner, Left, LeftSemi, LeftAnti + // Similarly, since partitioning of the left side is not preserved + // when mode is Right, RightSemi, RightAnti, Full + // - We need to add one additional Hash Repartition after SortMergeJoin in contrast the test + // cases when mode is Inner, Left, LeftSemi, LeftAnti + _ => { + assert_plan!(plan_distrib, @r" +SortMergeJoin: join_type=..., on=[(a@0, c@2)] + SortExec: expr=[a@0 ASC], preserve_partitioning=[true] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + SortMergeJoin: join_type=..., on=[(a@0, b1@1)] + SortExec: expr=[a@0 ASC], preserve_partitioning=[true] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + SortExec: expr=[b1@1 ASC], preserve_partitioning=[true] + RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + SortExec: expr=[c@2 ASC], preserve_partitioning=[true] + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + } + } - // TODO(wiedld): show different test result if enforce distribution first. - test_config.run( - &expected_first_sort_enforcement, - top_join, - &SORT_DISTRIB_DISTRIB, - )?; - } - _ => {} + let plan_sort = test_config.to_plan(top_join.clone(), &SORT_DISTRIB_DISTRIB); + + match join_type { + // Should include 6 RepartitionExecs (3 hash, 3 round-robin), 3 SortExecs + JoinType::Inner | JoinType::Left | JoinType::LeftSemi | JoinType::LeftAnti => { + // TODO(wiedld): show different test result if enforce distribution first. 
+ assert_plan!(plan_sort, @r"
+SortMergeJoin: join_type=..., on=[(a@0, c@2)]
+ SortMergeJoin: join_type=..., on=[(a@0, b1@1)]
+ RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1
+ SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1
+ SortExec: expr=[b1@1 ASC], preserve_partitioning=[false]
+ ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1
+ SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+");
+ }
+ // Should include 8 RepartitionExecs (4 hash, 8 round-robin), 4 SortExecs
+ // Since ordering of the left child is not preserved after SortMergeJoin
+ // when mode is Right, RightSemi, RightAnti, Full
+ // - We need to add one additional SortExec after SortMergeJoin in contrast to the test cases
+ // when mode is Inner, Left, LeftSemi, LeftAnti
+ // Similarly, since partitioning of the left side is not preserved
+ // when mode is Right, RightSemi, RightAnti, Full
+ // - We need to add one additional Hash Repartition and Roundrobin repartition after
+ // SortMergeJoin in contrast to the test cases when mode is Inner, Left, LeftSemi, LeftAnti
+ _ => {
+ // TODO(wiedld): show different test result if enforce distribution first.
+ assert_plan!(plan_sort, @r"
+SortMergeJoin: join_type=..., on=[(a@0, c@2)]
+ RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1
+ SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+ CoalescePartitionsExec
+ SortMergeJoin: join_type=..., on=[(a@0, b1@1)]
+ RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1
+ SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1
+ SortExec: expr=[b1@1 ASC], preserve_partitioning=[false]
+ ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1
+ SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+");
+ }
+ }
+
+ match join_type {
+ JoinType::Inner | JoinType::Left | JoinType::Right | JoinType::Full => {
+ // This time we use (b1 == c) for top join
+ // Join on (b1 == c)
+ let top_join_on = vec![(
+ Arc::new(Column::new_with_schema("b1", &join.schema()).unwrap()) as _,
+ Arc::new(Column::new_with_schema("c", &schema()).unwrap()) as _,
+ )];
+ let top_join = sort_merge_join_exec(join, parquet_exec(), &top_join_on, &join_type);
+
+ let plan_distrib = test_config.to_plan(top_join.clone(), &DISTRIB_DISTRIB_SORT);
+
+ match join_type {
+ // Should include 6 RepartitionExecs(3 hash, 3 round-robin) and 3 SortExecs
+ JoinType::Inner | JoinType::Right => {
+ // TODO(wiedld): show different test result if enforce sorting first.
+ assert_plan!(plan_distrib, @r"
+SortMergeJoin: join_type=..., on=[(b1@6, c@2)]
+ SortMergeJoin: join_type=..., on=[(a@0, b1@1)]
+ SortExec: expr=[a@0 ASC], preserve_partitioning=[true]
+ RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ SortExec: expr=[b1@1 ASC], preserve_partitioning=[true]
+ RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1
+ ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ SortExec: expr=[c@2 ASC], preserve_partitioning=[true]
+ RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+");
+ }
+ // Should include 7 RepartitionExecs (4 hash, 3 round-robin) and 4 SortExecs
+ JoinType::Left | JoinType::Full => {
+ // TODO(wiedld): show different test result if enforce sorting first.
+ assert_plan!(plan_distrib, @r"
+SortMergeJoin: join_type=..., on=[(b1@6, c@2)]
+ SortExec: expr=[b1@6 ASC], preserve_partitioning=[true]
+ RepartitionExec: partitioning=Hash([b1@6], 10), input_partitions=10
+ SortMergeJoin: join_type=..., on=[(a@0, b1@1)]
+ SortExec: expr=[a@0 ASC], preserve_partitioning=[true]
+ RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ SortExec: expr=[b1@1 ASC], preserve_partitioning=[true]
+ RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1
+ ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ SortExec: expr=[c@2 ASC], preserve_partitioning=[true]
+ RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+");
+ }
+ // this match arm cannot be reached
+ _ => unreachable!()
+ }
+
+ let plan_sort = test_config.to_plan(top_join, &SORT_DISTRIB_DISTRIB);
+
+ match join_type {
+ // Should include 6 RepartitionExecs (3 of them preserve order) and 3 SortExecs
+ JoinType::Inner | JoinType::Right => {
+ // TODO(wiedld): show different test result if enforce distribution first.
+ assert_plan!(plan_sort, @r"
+SortMergeJoin: join_type=..., on=[(b1@6, c@2)]
+ SortMergeJoin: join_type=..., on=[(a@0, b1@1)]
+ RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1
+ SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1
+ SortExec: expr=[b1@1 ASC], preserve_partitioning=[false]
+ ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1
+ SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+");
+ }
+ // Should include 8 RepartitionExecs (4 of them preserve order) and 4 SortExecs
+ JoinType::Left | JoinType::Full => {
+ // TODO(wiedld): show different test result if enforce distribution first.
+ assert_plan!(plan_sort, @r" +SortMergeJoin: join_type=..., on=[(b1@6, c@2)] + RepartitionExec: partitioning=Hash([b1@6], 10), input_partitions=1 + SortExec: expr=[b1@6 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + SortMergeJoin: join_type=..., on=[(a@0, b1@1)] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1 + SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1 + SortExec: expr=[b1@1 ASC], preserve_partitioning=[false] + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1 + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + } + // this match arm cannot be reached + _ => unreachable!() + } + } + _ => {} + } + }); } } - Ok(()) } @@ -1670,52 +1680,50 @@ fn smj_join_key_ordering() -> Result<()> { // Test: run EnforceDistribution, then EnforceSort. // Only two RepartitionExecs added - let expected = &[ - "SortMergeJoin: join_type=Inner, on=[(b3@1, b2@1), (a3@0, a2@0)]", - " SortExec: expr=[b3@1 ASC, a3@0 ASC], preserve_partitioning=[true]", - " ProjectionExec: expr=[a1@0 as a3, b1@1 as b3]", - " ProjectionExec: expr=[a1@1 as a1, b1@0 as b1]", - " AggregateExec: mode=FinalPartitioned, gby=[b1@0 as b1, a1@1 as a1], aggr=[]", - " RepartitionExec: partitioning=Hash([b1@0, a1@1], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[b@1 as b1, a@0 as a1], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " SortExec: expr=[b2@1 ASC, a2@0 ASC], preserve_partitioning=[true]", - " ProjectionExec: expr=[a@1 as a2, b@0 as b2]", - " AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, a@1 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([b@0, a@1], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[b@1 as b, a@0 as a], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run(expected, join.clone(), &DISTRIB_DISTRIB_SORT)?; + let plan_distrib = test_config.to_plan(join.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, @r" +SortMergeJoin: join_type=Inner, on=[(b3@1, b2@1), (a3@0, a2@0)] + SortExec: expr=[b3@1 ASC, a3@0 ASC], preserve_partitioning=[true] + ProjectionExec: expr=[a1@0 as a3, b1@1 as b3] + ProjectionExec: expr=[a1@1 as a1, b1@0 as b1] + AggregateExec: mode=FinalPartitioned, gby=[b1@0 as b1, a1@1 as a1], aggr=[] + RepartitionExec: partitioning=Hash([b1@0, a1@1], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[b@1 as b1, a@0 as a1], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + SortExec: expr=[b2@1 ASC, a2@0 ASC], preserve_partitioning=[true] + ProjectionExec: expr=[a@1 as a2, b@0 as b2] + AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, a@1 as a], aggr=[] + RepartitionExec: partitioning=Hash([b@0, a@1], 10), input_partitions=10 + 
AggregateExec: mode=Partial, gby=[b@1 as b, a@0 as a], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); // Test: result IS DIFFERENT, if EnforceSorting is run first: - let expected_first_sort_enforcement = &[ - "SortMergeJoin: join_type=Inner, on=[(b3@1, b2@1), (a3@0, a2@0)]", - " RepartitionExec: partitioning=Hash([b3@1, a3@0], 10), input_partitions=10, preserve_order=true, sort_exprs=b3@1 ASC, a3@0 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[b3@1 ASC, a3@0 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - " ProjectionExec: expr=[a1@0 as a3, b1@1 as b3]", - " ProjectionExec: expr=[a1@1 as a1, b1@0 as b1]", - " AggregateExec: mode=FinalPartitioned, gby=[b1@0 as b1, a1@1 as a1], aggr=[]", - " RepartitionExec: partitioning=Hash([b1@0, a1@1], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[b@1 as b1, a@0 as a1], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b2@1, a2@0], 10), input_partitions=10, preserve_order=true, sort_exprs=b2@1 ASC, a2@0 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[b2@1 ASC, a2@0 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - " ProjectionExec: expr=[a@1 as a2, b@0 as b2]", - " AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, a@1 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([b@0, a@1], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[b@1 as b, a@0 as a], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run(expected_first_sort_enforcement, join, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = test_config.to_plan(join, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_sort, @r" +SortMergeJoin: join_type=Inner, on=[(b3@1, b2@1), (a3@0, a2@0)] + RepartitionExec: partitioning=Hash([b3@1, a3@0], 10), input_partitions=1 + SortExec: expr=[b3@1 ASC, a3@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + ProjectionExec: expr=[a1@0 as a3, b1@1 as b3] + ProjectionExec: expr=[a1@1 as a1, b1@0 as b1] + AggregateExec: mode=FinalPartitioned, gby=[b1@0 as b1, a1@1 as a1], aggr=[] + RepartitionExec: partitioning=Hash([b1@0, a1@1], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[b@1 as b1, a@0 as a1], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b2@1, a2@0], 10), input_partitions=1 + SortExec: expr=[b2@1 ASC, a2@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + ProjectionExec: expr=[a@1 as a2, b@0 as b2] + AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, a@1 as a], aggr=[] + RepartitionExec: partitioning=Hash([b@0, a@1], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[b@1 as b, a@0 as a], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); Ok(()) } @@ -1744,13 +1752,14 @@ fn merge_does_not_need_sort() -> 
Result<()> { // // The optimizer should not add an additional SortExec as the // data is already sorted - let expected = &[ - "SortPreservingMergeExec: [a@0 ASC]", - " CoalesceBatchesExec: target_batch_size=4096", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, exec.clone(), &DISTRIB_DISTRIB_SORT)?; + let plan_distrib = test_config.to_plan(exec.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +SortPreservingMergeExec: [a@0 ASC] + CoalesceBatchesExec: target_batch_size=4096 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet +"); // Test: result IS DIFFERENT, if EnforceSorting is run first: // @@ -1758,13 +1767,14 @@ fn merge_does_not_need_sort() -> Result<()> { // (according to flag: PREFER_EXISTING_SORT) // hence in this case ordering lost during CoalescePartitionsExec and re-introduced with // SortExec at the top. - let expected_first_sort_enforcement = &[ - "SortExec: expr=[a@0 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - " CoalesceBatchesExec: target_batch_size=4096", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet", - ]; - test_config.run(expected_first_sort_enforcement, exec, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = test_config.to_plan(exec, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_sort, + @r" +SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + CoalesceBatchesExec: target_batch_size=4096 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet +"); Ok(()) } @@ -1790,25 +1800,26 @@ fn union_to_interleave() -> Result<()> { aggregate_exec_with_alias(plan, vec![("a1".to_string(), "a2".to_string())]); // Only two RepartitionExecs added, no final RepartitionExec required - let expected = &[ - "AggregateExec: mode=FinalPartitioned, gby=[a2@0 as a2], aggr=[]", - " AggregateExec: mode=Partial, gby=[a1@0 as a2], aggr=[]", - " InterleaveExec", - " AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[]", - " RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[]", - " RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" + AggregateExec: mode=FinalPartitioned, gby=[a2@0 as a2], aggr=[] + AggregateExec: mode=Partial, gby=[a1@0 as a2], aggr=[] + InterleaveExec + AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[] + RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10 + 
AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[] + RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -1834,28 +1845,29 @@ fn union_not_to_interleave() -> Result<()> { aggregate_exec_with_alias(plan, vec![("a1".to_string(), "a2".to_string())]); // Only two RepartitionExecs added, no final RepartitionExec required - let expected = &[ - "AggregateExec: mode=FinalPartitioned, gby=[a2@0 as a2], aggr=[]", - " RepartitionExec: partitioning=Hash([a2@0], 10), input_partitions=20", - " AggregateExec: mode=Partial, gby=[a1@0 as a2], aggr=[]", - " UnionExec", - " AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[]", - " RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[]", - " RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - // TestConfig: Prefer existing union. 
let test_config = TestConfig::default().with_prefer_existing_union(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" + AggregateExec: mode=FinalPartitioned, gby=[a2@0 as a2], aggr=[] + RepartitionExec: partitioning=Hash([a2@0], 10), input_partitions=20 + AggregateExec: mode=Partial, gby=[a1@0 as a2], aggr=[] + UnionExec + AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[] + RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[] + RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -1865,17 +1877,18 @@ fn added_repartition_to_single_partition() -> Result<()> { let alias = vec![("a".to_string(), "a".to_string())]; let plan = aggregate_exec_with_alias(parquet_exec(), alias); - let expected = [ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(&expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(&expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" + AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -1885,18 +1898,19 @@ fn repartition_deepest_node() -> Result<()> { let alias = vec![("a".to_string(), "a".to_string())]; let plan = aggregate_exec_with_alias(filter_exec(parquet_exec()), alias); - let expected = &[ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = 
test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" + AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -1905,19 +1919,20 @@ fn repartition_deepest_node() -> Result<()> { fn repartition_unsorted_limit() -> Result<()> { let plan = limit_exec(filter_exec(parquet_exec())); - let expected = &[ - "GlobalLimitExec: skip=0, fetch=100", - " CoalescePartitionsExec", - " LocalLimitExec: fetch=100", - " FilterExec: c@2 = 0", - // nothing sorts the data, so the local limit doesn't require sorted data either - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" + GlobalLimitExec: skip=0, fetch=100 + CoalescePartitionsExec + LocalLimitExec: fetch=100 + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); + // nothing sorts the data, so the local limit doesn't require sorted data either + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -1932,17 +1947,18 @@ fn repartition_sorted_limit() -> Result<()> { .into(); let plan = limit_exec(sort_exec(sort_key, parquet_exec())); - let expected = &[ - "GlobalLimitExec: skip=0, fetch=100", - " LocalLimitExec: fetch=100", - // data is sorted so can't repartition here - " SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +GlobalLimitExec: skip=0, fetch=100 + LocalLimitExec: fetch=100 + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // data is sorted so can't repartition here + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -1960,19 +1976,20 @@ fn repartition_sorted_limit_with_filter() -> Result<()> { sort_key, ); - let expected = &[ - "SortRequiredExec: [c@2 ASC]", - " FilterExec: c@2 = 0", - // We can use repartition here, ordering requirement by SortRequiredExec - // is still satisfied. 
- " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +SortRequiredExec: [c@2 ASC] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // We can use repartition here, ordering requirement by SortRequiredExec + // is still satisfied. + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -1985,26 +2002,28 @@ fn repartition_ignores_limit() -> Result<()> { alias, ); - let expected = &[ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " GlobalLimitExec: skip=0, fetch=100", - " CoalescePartitionsExec", - " LocalLimitExec: fetch=100", - " FilterExec: c@2 = 0", - // repartition should happen prior to the filter to maximize parallelism - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " GlobalLimitExec: skip=0, fetch=100", - " LocalLimitExec: fetch=100", - // Expect no repartition to happen for local limit - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + GlobalLimitExec: skip=0, fetch=100 + CoalescePartitionsExec + LocalLimitExec: fetch=100 + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + GlobalLimitExec: skip=0, fetch=100 + LocalLimitExec: fetch=100 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // repartition should happen prior to the filter to maximize parallelism + // Expect no repartition to happen for local limit (DataSourceExec) + + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -2013,19 +2032,20 @@ fn repartition_ignores_limit() -> Result<()> { fn repartition_ignores_union() -> Result<()> { let plan = union_exec(vec![parquet_exec(); 5]); - let expected = &[ - "UnionExec", - // Expect no repartition of DataSourceExec - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, 
projection=[a, b, c, d, e], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // Expect no repartition of DataSourceExec + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -2041,15 +2061,15 @@ fn repartition_through_sort_preserving_merge() -> Result<()> { .into(); let plan = sort_preserving_merge_exec(sort_key, parquet_exec()); - // need resort as the data was not sorted correctly - let expected = &[ - "SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -2068,24 +2088,25 @@ fn repartition_ignores_sort_preserving_merge() -> Result<()> { parquet_exec_multiple_sorted(vec![sort_key]), ); + let test_config = TestConfig::default(); + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); // Test: run EnforceDistribution, then EnforceSort - // + assert_plan!(plan_distrib, + @r" +SortPreservingMergeExec: [c@2 ASC] + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); // should not sort (as the data was already sorted) // should not repartition, since increased parallelism is not beneficial for SortPReservingMerge - let expected = &[ - "SortPreservingMergeExec: [c@2 ASC]", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; // Test: result IS DIFFERENT, if EnforceSorting is run first: - let expected_first_sort_enforcement = &[ - "SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - test_config.run(expected_first_sort_enforcement, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = 
test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_sort, + @r" +SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); Ok(()) } @@ -2105,27 +2126,29 @@ fn repartition_ignores_sort_preserving_merge_with_union() -> Result<()> { ]); let plan = sort_preserving_merge_exec(sort_key, input); + let test_config = TestConfig::default(); + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); // Test: run EnforceDistribution, then EnforceSort. + assert_plan!(plan_distrib, + @r" +SortPreservingMergeExec: [c@2 ASC] + UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); // // should not repartition / sort (as the data was already sorted) - let expected = &[ - "SortPreservingMergeExec: [c@2 ASC]", - " UnionExec", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; // test: result IS DIFFERENT, if EnforceSorting is run first: - let expected_first_sort_enforcement = &[ - "SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - " UnionExec", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - test_config.run(expected_first_sort_enforcement, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_sort, + @r" +SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); Ok(()) } @@ -2149,16 +2172,17 @@ fn repartition_does_not_destroy_sort() -> Result<()> { // TestConfig: Prefer existing sort. 
let test_config = TestConfig::default().with_prefer_existing_sort(); + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +SortRequiredExec: [d@3 ASC] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[d@3 ASC], file_type=parquet +"); // during repartitioning ordering is preserved - let expected = &[ - "SortRequiredExec: [d@3 ASC]", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[d@3 ASC], file_type=parquet", - ]; - - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -2187,22 +2211,25 @@ fn repartition_does_not_destroy_sort_more_complex() -> Result<()> { let input2 = filter_exec(parquet_exec()); let plan = union_exec(vec![input1, input2]); + let test_config = TestConfig::default(); + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +UnionExec + SortRequiredExec: [c@2 ASC] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // union input 1: no repartitioning + // union input 2: should repartition + // // should not repartition below the SortRequired as that // branch doesn't benefit from increased parallelism - let expected = &[ - "UnionExec", - // union input 1: no repartitioning - " SortRequiredExec: [c@2 ASC]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - // union input 2: should repartition - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -2227,28 +2254,28 @@ fn repartition_transitively_with_projection() -> Result<()> { .into(); let plan = sort_preserving_merge_exec(sort_key, proj); - // Test: run EnforceDistribution, then EnforceSort. 
- let expected = &[ - "SortPreservingMergeExec: [sum@0 ASC]", - " SortExec: expr=[sum@0 ASC], preserve_partitioning=[true]", - // Since this projection is not trivial, increasing parallelism is beneficial - " ProjectionExec: expr=[a@0 + b@1 as sum]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +SortPreservingMergeExec: [sum@0 ASC] + SortExec: expr=[sum@0 ASC], preserve_partitioning=[true] + ProjectionExec: expr=[a@0 + b@1 as sum] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); // Test: result IS DIFFERENT, if EnforceSorting is run first: - let expected_first_sort_enforcement = &[ - "SortExec: expr=[sum@0 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - // Since this projection is not trivial, increasing parallelism is beneficial - " ProjectionExec: expr=[a@0 + b@1 as sum]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run(expected_first_sort_enforcement, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_sort, + @r" +SortExec: expr=[sum@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + ProjectionExec: expr=[a@0 + b@1 as sum] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // Since this projection is not trivial, increasing parallelism is beneficial Ok(()) } @@ -2275,16 +2302,18 @@ fn repartition_ignores_transitively_with_projection() -> Result<()> { sort_key, ); - let expected = &[ - "SortRequiredExec: [c@2 ASC]", - // Since this projection is trivial, increasing parallelism is not beneficial - " ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c]", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +SortRequiredExec: [c@2 ASC] + ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c] + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); + // Since this projection is trivial, increasing parallelism is not beneficial + + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -2310,16 +2339,17 @@ fn repartition_transitively_past_sort_with_projection() -> Result<()> { ), ); - let expected = &[ - "SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - // Since this projection is trivial, increasing parallelism is not beneficial - " ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, 
c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // Since this projection is trivial, increasing parallelism is not beneficial + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -2334,28 +2364,30 @@ fn repartition_transitively_past_sort_with_filter() -> Result<()> { .into(); let plan = sort_exec(sort_key, filter_exec(parquet_exec())); - // Test: run EnforceDistribution, then EnforceSort. - let expected = &[ - "SortPreservingMergeExec: [a@0 ASC]", - " SortExec: expr=[a@0 ASC], preserve_partitioning=[true]", - // Expect repartition on the input to the sort (as it can benefit from additional parallelism) - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +SortPreservingMergeExec: [a@0 ASC] + SortExec: expr=[a@0 ASC], preserve_partitioning=[true] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + + // Expect repartition on the input to the sort (as it can benefit from additional parallelism) // Test: result IS DIFFERENT, if EnforceSorting is run first: - let expected_first_sort_enforcement = &[ - "SortExec: expr=[a@0 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - " FilterExec: c@2 = 0", - // Expect repartition on the input of the filter (as it can benefit from additional parallelism) - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run(expected_first_sort_enforcement, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_sort, + @r" +SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // Expect repartition on the input of the filter (as it can benefit from additional parallelism) Ok(()) } @@ -2381,30 +2413,32 @@ fn repartition_transitively_past_sort_with_projection_and_filter() -> Result<()> ), ); - // Test: run EnforceDistribution, then EnforceSort. 
- let expected = &[ - "SortPreservingMergeExec: [a@0 ASC]", - // Expect repartition on the input to the sort (as it can benefit from additional parallelism) - " SortExec: expr=[a@0 ASC], preserve_partitioning=[true]", - " ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c]", - " FilterExec: c@2 = 0", - // repartition is lowest down - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +SortPreservingMergeExec: [a@0 ASC] + SortExec: expr=[a@0 ASC], preserve_partitioning=[true] + ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + + // Expect repartition on the input to the sort (as it can benefit from additional parallelism) + // repartition is lowest down // Test: result IS DIFFERENT, if EnforceSorting is run first: - let expected_first_sort_enforcement = &[ - "SortExec: expr=[a@0 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - " ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c]", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run(expected_first_sort_enforcement, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_sort, + @r" +SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); Ok(()) } @@ -2420,28 +2454,29 @@ fn parallelization_single_partition() -> Result<()> { .with_query_execution_partitions(2); // Test: with parquet - let expected_parquet = [ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " DataSourceExec: file_groups={2 groups: [[x:0..50], [x:50..100]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run( - &expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config.run(&expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?; + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_parquet_distrib, + @r" +AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + DataSourceExec: file_groups={2 groups: [[x:0..50], [x:50..100]]}, projection=[a, b, c, d, e], file_type=parquet +"); + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_distrib, plan_parquet_sort); // Test: with csv - let expected_csv = [ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: 
partitioning=Hash([a@0], 2), input_partitions=2", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " DataSourceExec: file_groups={2 groups: [[x:0..50], [x:50..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; - test_config.run(&expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(&expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?; + let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_csv_distrib, + @r" +AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + DataSourceExec: file_groups={2 groups: [[x:0..50], [x:50..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false +"); + let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_csv_distrib, plan_csv_sort); Ok(()) } @@ -2465,40 +2500,31 @@ fn parallelization_multiple_files() -> Result<()> { // The groups must have only contiguous ranges of rows from the same file // if any group has rows from multiple files, the data is no longer sorted destroyed // https://github.com/apache/datafusion/issues/8451 - let expected_with_3_target_partitions = [ - "SortRequiredExec: [a@0 ASC]", - " FilterExec: c@2 = 0", - " DataSourceExec: file_groups={3 groups: [[x:0..50], [y:0..100], [x:50..100]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet", - ]; let test_config_concurrency_3 = test_config.clone().with_query_execution_partitions(3); - test_config_concurrency_3.run( - &expected_with_3_target_partitions, - plan.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config_concurrency_3.run( - &expected_with_3_target_partitions, - plan.clone(), - &SORT_DISTRIB_DISTRIB, - )?; + let plan_3_distrib = + test_config_concurrency_3.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_3_distrib, + @r" +SortRequiredExec: [a@0 ASC] + FilterExec: c@2 = 0 + DataSourceExec: file_groups={3 groups: [[x:0..50], [y:0..100], [x:50..100]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet +"); + let plan_3_sort = + test_config_concurrency_3.to_plan(plan.clone(), &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_3_distrib, plan_3_sort); - let expected_with_8_target_partitions = [ - "SortRequiredExec: [a@0 ASC]", - " FilterExec: c@2 = 0", - " DataSourceExec: file_groups={8 groups: [[x:0..25], [y:0..25], [x:25..50], [y:25..50], [x:50..75], [y:50..75], [x:75..100], [y:75..100]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet", - ]; let test_config_concurrency_8 = test_config.with_query_execution_partitions(8); - test_config_concurrency_8.run( - &expected_with_8_target_partitions, - plan.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config_concurrency_8.run( - &expected_with_8_target_partitions, - plan, - &SORT_DISTRIB_DISTRIB, - )?; + let plan_8_distrib = + test_config_concurrency_8.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_8_distrib, + @r" +SortRequiredExec: [a@0 ASC] + FilterExec: c@2 = 0 + DataSourceExec: file_groups={8 groups: [[x:0..25], [y:0..25], [x:25..50], [y:25..50], [x:50..75], [y:50..75], [x:75..100], [y:75..100]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet +"); + let plan_8_sort = test_config_concurrency_8.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_8_distrib, plan_8_sort); Ok(()) } @@ -2515,46 +2541,55 @@ fn 
parallelization_compressed_csv() -> Result<()> { FileCompressionType::UNCOMPRESSED, ]; - let expected_not_partitioned = [ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; - - let expected_partitioned = [ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " DataSourceExec: file_groups={2 groups: [[x:0..50], [x:50..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; + #[rustfmt::skip] + insta::allow_duplicates! { + for compression_type in compression_types { + let plan = aggregate_exec_with_alias( + DataSourceExec::from_data_source( + FileScanConfigBuilder::new(ObjectStoreUrl::parse("test:///").unwrap(), { + let options = CsvOptions { + has_header: Some(false), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + Arc::new(CsvSource::new(schema()).with_csv_options(options)) + }) + .with_file(PartitionedFile::new("x".to_string(), 100)) + .with_file_compression_type(compression_type) + .build(), + ), + vec![("a".to_string(), "a".to_string())], + ); + let test_config = TestConfig::default() + .with_query_execution_partitions(2) + .with_prefer_repartition_file_scans(10); + + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + if compression_type.is_compressed() { + // Compressed files cannot be partitioned + assert_plan!(plan_distrib, + @r" +AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false +"); + } else { + // Uncompressed files can be partitioned + assert_plan!(plan_distrib, + @r" +AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + DataSourceExec: file_groups={2 groups: [[x:0..50], [x:50..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false +"); + } - for compression_type in compression_types { - let expected = if compression_type.is_compressed() { - &expected_not_partitioned[..] - } else { - &expected_partitioned[..] 
- }; - - let plan = aggregate_exec_with_alias( - DataSourceExec::from_data_source( - FileScanConfigBuilder::new( - ObjectStoreUrl::parse("test:///").unwrap(), - schema(), - Arc::new(CsvSource::new(false, b',', b'"')), - ) - .with_file(PartitionedFile::new("x".to_string(), 100)) - .with_file_compression_type(compression_type) - .build(), - ), - vec![("a".to_string(), "a".to_string())], - ); - let test_config = TestConfig::default() - .with_query_execution_partitions(2) - .with_prefer_repartition_file_scans(10); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); + } } Ok(()) } @@ -2570,30 +2605,30 @@ fn parallelization_two_partitions() -> Result<()> { .with_prefer_repartition_file_scans(10); // Test: with parquet - let expected_parquet = [ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - // Plan already has two partitions - " DataSourceExec: file_groups={2 groups: [[x:0..100], [y:0..100]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run( - &expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config.run(&expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?; + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_parquet_distrib, + @r" +AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + DataSourceExec: file_groups={2 groups: [[x:0..100], [y:0..100]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // Plan already has two partitions + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_distrib, plan_parquet_sort); // Test: with csv - let expected_csv = [ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - // Plan already has two partitions - " DataSourceExec: file_groups={2 groups: [[x:0..100], [y:0..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; - test_config.run(&expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(&expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?; + let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_csv_distrib, @r" +AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + DataSourceExec: file_groups={2 groups: [[x:0..100], [y:0..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false +"); + // Plan already has two partitions + let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_csv_distrib, plan_csv_sort); Ok(()) } @@ -2609,30 +2644,32 @@ fn parallelization_two_partitions_into_four() -> Result<()> { .with_prefer_repartition_file_scans(10); // Test: with parquet - let expected_parquet = [ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 4), 
input_partitions=4", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - // Multiple source files split across partitions - " DataSourceExec: file_groups={4 groups: [[x:0..50], [x:50..100], [y:0..50], [y:50..100]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run( - &expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config.run(&expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?; + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); + // Multiple source files split across partitions + assert_plan!(plan_parquet_distrib, + @r" +AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + DataSourceExec: file_groups={4 groups: [[x:0..50], [x:50..100], [y:0..50], [y:50..100]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // Multiple source files split across partitions + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_distrib, plan_parquet_sort); // Test: with csv - let expected_csv = [ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - // Multiple source files split across partitions - " DataSourceExec: file_groups={4 groups: [[x:0..50], [x:50..100], [y:0..50], [y:50..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; - test_config.run(&expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(&expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?; + let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + // Multiple source files split across partitions + assert_plan!(plan_csv_distrib, @r" +AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + DataSourceExec: file_groups={4 groups: [[x:0..50], [x:50..100], [y:0..50], [y:50..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false +"); + // Multiple source files split across partitions + let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_csv_distrib, plan_csv_sort); Ok(()) } @@ -2651,32 +2688,32 @@ fn parallelization_sorted_limit() -> Result<()> { let test_config = TestConfig::default(); // Test: with parquet - let expected_parquet = &[ - "GlobalLimitExec: skip=0, fetch=100", - " LocalLimitExec: fetch=100", - // data is sorted so can't repartition here - " SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - // Doesn't parallelize for SortExec without preserve_partitioning - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run( - expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config.run(expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?; + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_parquet_distrib, @r" +GlobalLimitExec: skip=0, fetch=100 + LocalLimitExec: fetch=100 + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // data is sorted so can't repartition here + // 
Doesn't parallelize for SortExec without preserve_partitioning + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_distrib, plan_parquet_sort); // Test: with csv - let expected_csv = &[ - "GlobalLimitExec: skip=0, fetch=100", - " LocalLimitExec: fetch=100", - // data is sorted so can't repartition here - " SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - // Doesn't parallelize for SortExec without preserve_partitioning - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; - test_config.run(expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?; + let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_csv_distrib, + @r" +GlobalLimitExec: skip=0, fetch=100 + LocalLimitExec: fetch=100 + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false +"); + // data is sorted so can't repartition here + // Doesn't parallelize for SortExec without preserve_partitioning + let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_csv_distrib, plan_csv_sort); Ok(()) } @@ -2696,40 +2733,41 @@ fn parallelization_limit_with_filter() -> Result<()> { let test_config = TestConfig::default(); // Test: with parquet - let expected_parquet = &[ - "GlobalLimitExec: skip=0, fetch=100", - " CoalescePartitionsExec", - " LocalLimitExec: fetch=100", - " FilterExec: c@2 = 0", - // even though data is sorted, we can use repartition here. Since - // ordering is not used in subsequent stages anyway. - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - // SortExec doesn't benefit from input partitioning - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run( - expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config.run(expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?; + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); + // even though data is sorted, we can use repartition here. Since + // ordering is not used in subsequent stages anyway. + // SortExec doesn't benefit from input partitioning + assert_plan!(plan_parquet_distrib, + @r" +GlobalLimitExec: skip=0, fetch=100 + CoalescePartitionsExec + LocalLimitExec: fetch=100 + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_distrib, plan_parquet_sort); // Test: with csv - let expected_csv = &[ - "GlobalLimitExec: skip=0, fetch=100", - " CoalescePartitionsExec", - " LocalLimitExec: fetch=100", - " FilterExec: c@2 = 0", - // even though data is sorted, we can use repartition here. Since - // ordering is not used in subsequent stages anyway. 
- " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - // SortExec doesn't benefit from input partitioning - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; - test_config.run(expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?; + let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + // even though data is sorted, we can use repartition here. Since + // ordering is not used in subsequent stages anyway. + // SortExec doesn't benefit from input partitioning + assert_plan!(plan_csv_distrib, + @r" +GlobalLimitExec: skip=0, fetch=100 + CoalescePartitionsExec + LocalLimitExec: fetch=100 + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false +"); + let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_csv_distrib, plan_csv_sort); Ok(()) } @@ -2747,48 +2785,49 @@ fn parallelization_ignores_limit() -> Result<()> { let test_config = TestConfig::default(); // Test: with parquet - let expected_parquet = &[ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " GlobalLimitExec: skip=0, fetch=100", - " CoalescePartitionsExec", - " LocalLimitExec: fetch=100", - " FilterExec: c@2 = 0", - // repartition should happen prior to the filter to maximize parallelism - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " GlobalLimitExec: skip=0, fetch=100", - // Limit doesn't benefit from input partitioning - no parallelism - " LocalLimitExec: fetch=100", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run( - expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config.run(expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?; + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_parquet_distrib, + @r" + AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + GlobalLimitExec: skip=0, fetch=100 + CoalescePartitionsExec + LocalLimitExec: fetch=100 + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + GlobalLimitExec: skip=0, fetch=100 + LocalLimitExec: fetch=100 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); + // repartition should happen prior to the filter to maximize parallelism + // Limit doesn't benefit from input partitioning - no parallelism + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_distrib, plan_parquet_sort); // Test: with csv - let expected_csv = &[ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: 
partitioning=Hash([a@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " GlobalLimitExec: skip=0, fetch=100", - " CoalescePartitionsExec", - " LocalLimitExec: fetch=100", - " FilterExec: c@2 = 0", - // repartition should happen prior to the filter to maximize parallelism - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " GlobalLimitExec: skip=0, fetch=100", - // Limit doesn't benefit from input partitioning - no parallelism - " LocalLimitExec: fetch=100", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; - test_config.run(expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?; + let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_csv_distrib, + @r" + AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + GlobalLimitExec: skip=0, fetch=100 + CoalescePartitionsExec + LocalLimitExec: fetch=100 + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + GlobalLimitExec: skip=0, fetch=100 + LocalLimitExec: fetch=100 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + "); + // repartition should happen prior to the filter to maximize parallelism + // Limit doesn't benefit from input partitioning - no parallelism + let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_csv_distrib, plan_csv_sort); Ok(()) } @@ -2801,34 +2840,35 @@ fn parallelization_union_inputs() -> Result<()> { let test_config = TestConfig::default(); // Test: with parquet - let expected_parquet = &[ - "UnionExec", - // Union doesn't benefit from input partitioning - no parallelism - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run( - expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config.run(expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?; + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_parquet_distrib, + @r" +UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // Union doesn't benefit from input partitioning - no parallelism + let plan_parquet_sort = 
test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_distrib, plan_parquet_sort); // Test: with csv - let expected_csv = &[ - "UnionExec", - // Union doesn't benefit from input partitioning - no parallelism - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; - test_config.run(expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?; + let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_csv_distrib, + @r" +UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false +"); + // Union doesn't benefit from input partitioning - no parallelism + let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_csv_distrib, plan_csv_sort); Ok(()) } @@ -2855,22 +2895,21 @@ fn parallelization_prior_to_sort_preserving_merge() -> Result<()> { // parallelization is not beneficial for SortPreservingMerge // Test: with parquet - let expected_parquet = &[ - "DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - test_config.run( - expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config.run(expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?; + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_parquet_distrib, + @"DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet" + ); + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_distrib, plan_parquet_sort); // Test: with csv - let expected_csv = &[ - "DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false", - ]; - test_config.run(expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?; + let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_csv_distrib, + @"DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false" + ); + let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_csv_distrib, plan_csv_sort); Ok(()) } @@ -2900,54 +2939,47 @@ fn parallelization_sort_preserving_merge_with_union() -> Result<()> { 
// should not sort (as the data was already sorted) // Test: with parquet - let expected_parquet = &[ - "SortPreservingMergeExec: [c@2 ASC]", - " UnionExec", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - test_config.run( - expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - let expected_parquet_first_sort_enforcement = &[ - // no SPM - "SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - // has coalesce - " CoalescePartitionsExec", - " UnionExec", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - test_config.run( - expected_parquet_first_sort_enforcement, - plan_parquet, - &SORT_DISTRIB_DISTRIB, - )?; + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_parquet_distrib, + @r" + SortPreservingMergeExec: [c@2 ASC] + UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + "); + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_sort, + @r" + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + "); + // no SPM + // has coalesce // Test: with csv - let expected_csv = &[ - "SortPreservingMergeExec: [c@2 ASC]", - " UnionExec", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false", - ]; - test_config.run(expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - let expected_csv_first_sort_enforcement = &[ - // no SPM - "SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - // has coalesce - " CoalescePartitionsExec", - " UnionExec", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false", - ]; - test_config.run( - expected_csv_first_sort_enforcement, - plan_csv.clone(), - &SORT_DISTRIB_DISTRIB, - )?; + let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_csv_distrib, + @r" + SortPreservingMergeExec: [c@2 ASC] + UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false + "); + let plan_csv_sort = test_config.to_plan(plan_csv.clone(), 
&SORT_DISTRIB_DISTRIB); + assert_plan!(plan_csv_sort, + @r" + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false + "); + // no SPM + // has coalesce Ok(()) } @@ -2975,24 +3007,25 @@ fn parallelization_does_not_benefit() -> Result<()> { // no parallelization, because SortRequiredExec doesn't benefit from increased parallelism // Test: with parquet - let expected_parquet = &[ - "SortRequiredExec: [c@2 ASC]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - test_config.run( - expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config.run(expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?; + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_parquet_distrib, + @r" + SortRequiredExec: [c@2 ASC] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + "); + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_distrib, plan_parquet_sort); // Test: with csv - let expected_csv = &[ - "SortRequiredExec: [c@2 ASC]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false", - ]; - test_config.run(expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?; + let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_csv_distrib, + @r" + SortRequiredExec: [c@2 ASC] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false + "); + let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_csv_distrib, plan_csv_sort); Ok(()) } @@ -3023,26 +3056,26 @@ fn parallelization_ignores_transitively_with_projection_parquet() -> Result<()> .into(); let plan_parquet = sort_preserving_merge_exec(sort_key_after_projection, proj_parquet); - let expected = &[ - "SortPreservingMergeExec: [c2@1 ASC]", - " ProjectionExec: expr=[a@0 as a2, c@2 as c2]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - plans_matches_expected!(expected, &plan_parquet); + assert_plan!(plan_parquet, + @r" + SortPreservingMergeExec: [c2@1 ASC] + ProjectionExec: expr=[a@0 as a2, c@2 as c2] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + "); + + let test_config = TestConfig::default(); + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); // Expected Outcome: // data should not be repartitioned / resorted - let expected_parquet = &[ - "ProjectionExec: expr=[a@0 as a2, c@2 as c2]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run( - expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - 
test_config.run(expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?; + assert_plan!(plan_parquet_distrib, + @r" +ProjectionExec: expr=[a@0 as a2, c@2 as c2] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_distrib, plan_parquet_sort); Ok(()) } @@ -3071,22 +3104,24 @@ fn parallelization_ignores_transitively_with_projection_csv() -> Result<()> { }] .into(); let plan_csv = sort_preserving_merge_exec(sort_key_after_projection, proj_csv); - let expected = &[ - "SortPreservingMergeExec: [c2@1 ASC]", - " ProjectionExec: expr=[a@0 as a2, c@2 as c2]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false", - ]; - plans_matches_expected!(expected, &plan_csv); + assert_plan!(plan_csv, + @r" +SortPreservingMergeExec: [c2@1 ASC] + ProjectionExec: expr=[a@0 as a2, c@2 as c2] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false +"); + let test_config = TestConfig::default(); + let plan_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +ProjectionExec: expr=[a@0 as a2, c@2 as c2] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false +"); // Expected Outcome: // data should not be repartitioned / resorted - let expected_csv = &[ - "ProjectionExec: expr=[a@0 as a2, c@2 as c2]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false", - ]; - let test_config = TestConfig::default(); - test_config.run(expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -3096,24 +3131,25 @@ fn remove_redundant_roundrobins() -> Result<()> { let input = parquet_exec(); let repartition = repartition_exec(repartition_exec(input)); let physical_plan = repartition_exec(filter_exec(repartition)); - let expected = &[ - "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - plans_matches_expected!(expected, &physical_plan); - - let expected = &[ - "FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; + assert_plan!(physical_plan, + @r" +RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10 + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); let test_config = TestConfig::default(); - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, physical_plan, 
&SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -3133,18 +3169,19 @@ fn remove_unnecessary_spm_after_filter() -> Result<()> { // TestConfig: Prefer existing sort. let test_config = TestConfig::default().with_prefer_existing_sort(); + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); // Expected Outcome: // Original plan expects its output to be ordered by c@2 ASC. // This is still satisfied since, after filter that column is constant. - let expected = &[ - "CoalescePartitionsExec", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2, preserve_order=true, sort_exprs=c@2 ASC", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?; + assert_plan!(plan_distrib, + @r" +CoalescePartitionsExec + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2, preserve_order=true, sort_exprs=c@2 ASC + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -3164,14 +3201,16 @@ fn preserve_ordering_through_repartition() -> Result<()> { // TestConfig: Prefer existing sort. let test_config = TestConfig::default().with_prefer_existing_sort(); - let expected = &[ - "SortPreservingMergeExec: [d@3 ASC]", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2, preserve_order=true, sort_exprs=d@3 ASC", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[d@3 ASC], file_type=parquet", - ]; - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +SortPreservingMergeExec: [d@3 ASC] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2, preserve_order=true, sort_exprs=d@3 ASC + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[d@3 ASC], file_type=parquet +"); + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -3189,29 +3228,27 @@ fn do_not_preserve_ordering_through_repartition() -> Result<()> { let test_config = TestConfig::default(); + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); // Test: run EnforceDistribution, then EnforceSort. 
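Across these hunks the string-array expectations handed to test_config.run(...) are replaced by a two-step pattern: materialize the optimized plan once with test_config.to_plan(plan, &RUNS), snapshot its rendered tree with assert_plan!(plan, @r"..."), and check that both optimizer run orders agree by passing the two plans to assert_plan!. The real macro lives in the shared physical-optimizer test utilities and is not part of this diff; the following is only a minimal sketch of what it is assumed to do, rendering the plan with displayable(..).indent(true) and deferring to insta's assert_snapshot!:

// Hypothetical sketch only; the actual assert_plan! helper is defined in the
// shared test utilities and may differ in detail.
use std::sync::Arc;

use datafusion_physical_plan::{displayable, ExecutionPlan};

/// Render a plan as the indented tree that appears in the snapshots above.
fn render_plan(plan: &Arc<dyn ExecutionPlan>) -> String {
    displayable(plan.as_ref()).indent(true).to_string()
}

macro_rules! assert_plan {
    // One-plan form: inline-snapshot the rendered tree via insta.
    ($plan:expr, @ $snapshot:literal) => {
        insta::assert_snapshot!(render_plan(&$plan), @ $snapshot)
    };
    // Two-plan form: both optimizer run orders must produce the same plan.
    ($left:expr, $right:expr) => {
        assert_eq!(render_plan(&$left), render_plan(&$right))
    };
}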
- let expected = &[ - "SortPreservingMergeExec: [a@0 ASC]", - " SortExec: expr=[a@0 ASC], preserve_partitioning=[true]", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet", - ]; - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; + assert_plan!(plan_distrib, + @r" +SortPreservingMergeExec: [a@0 ASC] + SortExec: expr=[a@0 ASC], preserve_partitioning=[true] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet +"); // Test: result IS DIFFERENT, if EnforceSorting is run first: - let expected_first_sort_enforcement = &[ - "SortExec: expr=[a@0 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet", - ]; - test_config.run( - expected_first_sort_enforcement, - physical_plan, - &SORT_DISTRIB_DISTRIB, - )?; + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_sort, + @r" +SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet +"); Ok(()) } @@ -3227,17 +3264,18 @@ fn no_need_for_sort_after_filter() -> Result<()> { let input = parquet_exec_multiple_sorted(vec![sort_key.clone()]); let physical_plan = sort_preserving_merge_exec(sort_key, filter_exec(input)); - let expected = &[ - // After CoalescePartitionsExec c is still constant. Hence c@2 ASC ordering is already satisfied. - "CoalescePartitionsExec", - // Since after this stage c is constant. c@2 ASC ordering is already satisfied. - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, @r" +CoalescePartitionsExec + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); + // After CoalescePartitionsExec c is still constant. Hence c@2 ASC ordering is already satisfied. + // Since after this stage c is constant. c@2 ASC ordering is already satisfied. 
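The two trailing comments capture the invariant this test relies on: after FilterExec: c@2 = 0 every surviving row carries the same value in column c, so the required c@2 ASC ordering holds no matter how CoalescePartitionsExec concatenates the partitions, and no extra SortExec or SortPreservingMergeExec is needed. A small standalone illustration of that reasoning (plain Rust, not DataFusion API):

// Rows are (a, c) pairs; "sorted by c" in the sense required by `c@2 ASC`.
fn is_sorted_by_c(rows: &[(i64, i64)]) -> bool {
    rows.windows(2).all(|w| w[0].1 <= w[1].1)
}

fn main() {
    // Two "partitions", standing in for the two parquet file groups.
    let partition_a = vec![(7, 3), (1, 0), (5, 0)];
    let partition_b = vec![(2, 0), (9, 1), (4, 0)];

    // The equivalent of `FilterExec: c@2 = 0`; afterwards c is constant.
    let filtered: Vec<(i64, i64)> = partition_a
        .into_iter()
        .chain(partition_b) // arbitrary concatenation, like CoalescePartitionsExec
        .filter(|&(_, c)| c == 0)
        .collect();

    // The ordering requirement on c is satisfied without any re-sort.
    assert!(is_sorted_by_c(&filtered));
}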
Ok(()) } @@ -3261,30 +3299,28 @@ fn do_not_preserve_ordering_through_repartition2() -> Result<()> { let test_config = TestConfig::default(); + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); // Test: run EnforceDistribution, then EnforceSort. - let expected = &[ - "SortPreservingMergeExec: [a@0 ASC]", - " SortExec: expr=[a@0 ASC], preserve_partitioning=[true]", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; + assert_plan!(plan_distrib, + @r" +SortPreservingMergeExec: [a@0 ASC] + SortExec: expr=[a@0 ASC], preserve_partitioning=[true] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); // Test: result IS DIFFERENT, if EnforceSorting is run first: - let expected_first_sort_enforcement = &[ - "SortExec: expr=[a@0 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - " SortExec: expr=[a@0 ASC], preserve_partitioning=[true]", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - test_config.run( - expected_first_sort_enforcement, - physical_plan, - &SORT_DISTRIB_DISTRIB, - )?; + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_sort, + @r" +SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + SortExec: expr=[a@0 ASC], preserve_partitioning=[true] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); Ok(()) } @@ -3300,14 +3336,16 @@ fn do_not_preserve_ordering_through_repartition3() -> Result<()> { let input = parquet_exec_multiple_sorted(vec![sort_key]); let physical_plan = filter_exec(input); - let expected = &[ - "FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -3322,30 +3360,27 @@ fn do_not_put_sort_when_input_is_invalid() -> Result<()> { .into(); let input = parquet_exec(); let physical_plan = sort_required_exec_with_req(filter_exec(input), sort_key); - let expected = &[ - // Ordering requirement of sort required exec is NOT satisfied - // by 
existing ordering at the source. - "SortRequiredExec: [a@0 ASC]", - " FilterExec: c@2 = 0", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - assert_plan_txt!(expected, physical_plan); - - let expected = &[ - "SortRequiredExec: [a@0 ASC]", - // Since at the start of the rule ordering requirement is not satisfied - // EnforceDistribution rule doesn't satisfy this requirement either. - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; + // Ordering requirement of sort required exec is NOT satisfied + // by existing ordering at the source. + assert_plan!(physical_plan, @r" +SortRequiredExec: [a@0 ASC] + FilterExec: c@2 = 0 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); let mut config = ConfigOptions::new(); config.execution.target_partitions = 10; config.optimizer.enable_round_robin_repartition = true; config.optimizer.prefer_existing_sort = false; let dist_plan = EnforceDistribution::new().optimize(physical_plan, &config)?; - assert_plan_txt!(expected, dist_plan); + // Since at the start of the rule ordering requirement is not satisfied + // EnforceDistribution rule doesn't satisfy this requirement either. + assert_plan!(dist_plan, @r" +SortRequiredExec: [a@0 ASC] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); Ok(()) } @@ -3361,29 +3396,26 @@ fn put_sort_when_input_is_valid() -> Result<()> { let input = parquet_exec_multiple_sorted(vec![sort_key.clone()]); let physical_plan = sort_required_exec_with_req(filter_exec(input), sort_key); - let expected = &[ - // Ordering requirement of sort required exec is satisfied - // by existing ordering at the source. - "SortRequiredExec: [a@0 ASC]", - " FilterExec: c@2 = 0", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet", - ]; - assert_plan_txt!(expected, physical_plan); - - let expected = &[ - // Since at the start of the rule ordering requirement is satisfied - // EnforceDistribution rule satisfy this requirement also. - "SortRequiredExec: [a@0 ASC]", - " FilterExec: c@2 = 0", - " DataSourceExec: file_groups={10 groups: [[x:0..20], [y:0..20], [x:20..40], [y:20..40], [x:40..60], [y:40..60], [x:60..80], [y:60..80], [x:80..100], [y:80..100]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet", - ]; + // Ordering requirement of sort required exec is satisfied + // by existing ordering at the source. + assert_plan!(physical_plan, @r" +SortRequiredExec: [a@0 ASC] + FilterExec: c@2 = 0 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet +"); let mut config = ConfigOptions::new(); config.execution.target_partitions = 10; config.optimizer.enable_round_robin_repartition = true; config.optimizer.prefer_existing_sort = false; let dist_plan = EnforceDistribution::new().optimize(physical_plan, &config)?; - assert_plan_txt!(expected, dist_plan); + // Since at the start of the rule ordering requirement is satisfied + // EnforceDistribution rule satisfy this requirement also. 
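These two tests drive EnforceDistribution directly rather than through TestConfig: they build ConfigOptions with target_partitions = 10 and call EnforceDistribution::new().optimize(physical_plan, &config). In the satisfied case, the assertion that follows shows the two 100-byte sorted files being split into ten single-range groups ([x:0..20], [y:0..20], ..., [y:80..100]) so that each group still reads its rows in a@0 ASC order. The grouping in that snapshot follows from simple arithmetic; the sketch below is not the real file-group partitioner, it only reproduces the listed ranges under the assumption of two equally sized, equally split files:

fn main() {
    // Two sorted parquet files of 100 bytes each, target_partitions = 10.
    let files = [("x", 100u64), ("y", 100u64)];
    let target_partitions = 10u64;

    let total: u64 = files.iter().map(|(_, len)| len).sum(); // 200 bytes
    let bytes_per_group = total / target_partitions; // 20 bytes per group
    let ranges_per_file = files[0].1 / bytes_per_group; // 5 ranges per file

    // Alternate over the files so every group is a single contiguous,
    // still-sorted byte range of one file.
    let mut groups = Vec::new();
    for i in 0..ranges_per_file {
        for (name, _) in &files {
            groups.push(format!(
                "{name}:{}..{}",
                i * bytes_per_group,
                (i + 1) * bytes_per_group
            ));
        }
    }

    assert_eq!(groups.len(), 10);
    assert_eq!(groups[0], "x:0..20");
    assert_eq!(groups[9], "y:80..100");
    println!("{groups:?}");
}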
+ assert_plan!(dist_plan, @r" +SortRequiredExec: [a@0 ASC] + FilterExec: c@2 = 0 + DataSourceExec: file_groups={10 groups: [[x:0..20], [y:0..20], [x:20..40], [y:20..40], [x:40..60], [y:40..60], [x:60..80], [y:60..80], [x:80..100], [y:80..100]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet +"); Ok(()) } @@ -3404,13 +3436,15 @@ fn do_not_add_unnecessary_hash() -> Result<()> { // Make sure target partition number is 1. In this case hash repartition is unnecessary. let test_config = TestConfig::default().with_query_execution_partitions(1); - let expected = &[ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -3432,19 +3466,21 @@ fn do_not_add_unnecessary_hash2() -> Result<()> { // Make sure target partition number is larger than 2 (e.g partition number at the source). let test_config = TestConfig::default().with_query_execution_partitions(4); - let expected = &[ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - // Since hash requirements of this operator is satisfied. There shouldn't be - // a hash repartition here - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); + // Since hash requirements of this operator is satisfied. 
There shouldn't be + // a hash repartition here + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -3452,19 +3488,21 @@ fn do_not_add_unnecessary_hash2() -> Result<()> { #[test] fn optimize_away_unnecessary_repartition() -> Result<()> { let physical_plan = coalesce_partitions_exec(repartition_exec(parquet_exec())); - let expected = &[ - "CoalescePartitionsExec", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - plans_matches_expected!(expected, physical_plan.clone()); - - let expected = - &["DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet"]; + assert_plan!(physical_plan, + @r" +CoalescePartitionsExec + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); let test_config = TestConfig::default(); - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -3474,25 +3512,27 @@ fn optimize_away_unnecessary_repartition2() -> Result<()> { let physical_plan = filter_exec(repartition_exec(coalesce_partitions_exec( filter_exec(repartition_exec(parquet_exec())), ))); - let expected = &[ - "FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " CoalescePartitionsExec", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - plans_matches_expected!(expected, physical_plan.clone()); + assert_plan!(physical_plan, + @r" +FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + CoalescePartitionsExec + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); - let expected = &[ - "FilterExec: c@2 = 0", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +FilterExec: c@2 = 0 + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -3512,27 +3552,31 @@ async fn test_distribute_sort_parquet() -> Result<()> { let physical_plan = 
sort_exec(sort_key, parquet_exec_with_stats(10000 * 8192)); // prior to optimization, this is the starting plan - let starting = &[ - "SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - plans_matches_expected!(starting, physical_plan.clone()); + assert_plan!(physical_plan, + @r" +SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); // what the enforce distribution run does. - let expected = &[ - "SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - " DataSourceExec: file_groups={10 groups: [[x:0..8192000], [x:8192000..16384000], [x:16384000..24576000], [x:24576000..32768000], [x:32768000..40960000], [x:40960000..49152000], [x:49152000..57344000], [x:57344000..65536000], [x:65536000..73728000], [x:73728000..81920000]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run(expected, physical_plan.clone(), &[Run::Distribution])?; + let plan_distribution = + test_config.to_plan(physical_plan.clone(), &[Run::Distribution]); + assert_plan!(plan_distribution, + @r" +SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + DataSourceExec: file_groups={10 groups: [[x:0..8192000], [x:8192000..16384000], [x:16384000..24576000], [x:24576000..32768000], [x:32768000..40960000], [x:40960000..49152000], [x:49152000..57344000], [x:57344000..65536000], [x:65536000..73728000], [x:73728000..81920000]]}, projection=[a, b, c, d, e], file_type=parquet +"); // what the sort parallelization (in enforce sorting), does after the enforce distribution changes - let expected = &[ - "SortPreservingMergeExec: [c@2 ASC]", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[true]", - " DataSourceExec: file_groups={10 groups: [[x:0..8192000], [x:8192000..16384000], [x:16384000..24576000], [x:24576000..32768000], [x:32768000..40960000], [x:40960000..49152000], [x:49152000..57344000], [x:57344000..65536000], [x:65536000..73728000], [x:73728000..81920000]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run(expected, physical_plan, &[Run::Distribution, Run::Sorting])?; + let plan_both = + test_config.to_plan(physical_plan, &[Run::Distribution, Run::Sorting]); + assert_plan!(plan_both, + @r" +SortPreservingMergeExec: [c@2 ASC] + SortExec: expr=[c@2 ASC], preserve_partitioning=[true] + DataSourceExec: file_groups={10 groups: [[x:0..8192000], [x:8192000..16384000], [x:16384000..24576000], [x:24576000..32768000], [x:32768000..40960000], [x:40960000..49152000], [x:49152000..57344000], [x:57344000..65536000], [x:65536000..73728000], [x:73728000..81920000]]}, projection=[a, b, c, d, e], file_type=parquet +"); Ok(()) } @@ -3557,12 +3601,12 @@ async fn test_distribute_sort_memtable() -> Result<()> { let physical_plan = dataframe.create_physical_plan().await?; // this is the final, optimized plan - let expected = &[ - "SortPreservingMergeExec: [id@0 ASC NULLS LAST]", - " SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true]", - " DataSourceExec: partitions=3, partition_sizes=[34, 33, 33]", - ]; - plans_matches_expected!(expected, physical_plan); + assert_plan!(physical_plan, + @r" +SortPreservingMergeExec: [id@0 ASC NULLS LAST] + SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true] + DataSourceExec: partitions=3, partition_sizes=[34, 33, 33] +"); Ok(()) } diff --git 
a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs index ad77a453350f..c0cfa46733f1 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs @@ -31,7 +31,7 @@ use crate::physical_optimizer::test_utils::{ use arrow::compute::SortOptions; use arrow::datatypes::{DataType, SchemaRef}; -use datafusion_common::config::ConfigOptions; +use datafusion_common::config::{ConfigOptions, CsvOptions}; use datafusion_common::tree_node::{TreeNode, TransformedResult}; use datafusion_common::{Result, TableReference}; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; @@ -72,10 +72,15 @@ fn csv_exec_sorted( schema: &SchemaRef, sort_exprs: impl IntoIterator, ) -> Arc { + let options = CsvOptions { + has_header: Some(false), + delimiter: 0, + quote: 0, + ..Default::default() + }; let mut builder = FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), - schema.clone(), - Arc::new(CsvSource::new(false, 0, 0)), + Arc::new(CsvSource::new(schema.clone()).with_csv_options(options)), ) .with_file(PartitionedFile::new("x".to_string(), 100)); if let Some(ordering) = LexOrdering::new(sort_exprs) { @@ -359,6 +364,94 @@ async fn test_union_inputs_different_sorted2() -> Result<()> { Ok(()) } +#[tokio::test] +// Test with `repartition_sorts` enabled to preserve pre-sorted partitions and avoid resorting +async fn union_with_mix_of_presorted_and_explicitly_resorted_inputs_with_repartition_sorts_true( +) -> Result<()> { + assert_snapshot!( + union_with_mix_of_presorted_and_explicitly_resorted_inputs_impl(true).await?, + @r" + Input Plan: + OutputRequirementExec: order_by=[(nullable_col@0, asc)], dist_by=SinglePartition + CoalescePartitionsExec + UnionExec + SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet + + Optimized Plan: + OutputRequirementExec: order_by=[(nullable_col@0, asc)], dist_by=SinglePartition + SortPreservingMergeExec: [nullable_col@0 ASC] + UnionExec + SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet + "); + Ok(()) +} + +#[tokio::test] +// Test with `repartition_sorts` disabled, causing a full resort of the data +async fn union_with_mix_of_presorted_and_explicitly_resorted_inputs_with_repartition_sorts_false( +) -> Result<()> { + assert_snapshot!( + union_with_mix_of_presorted_and_explicitly_resorted_inputs_impl(false).await?, + @r" + Input Plan: + OutputRequirementExec: order_by=[(nullable_col@0, asc)], dist_by=SinglePartition + CoalescePartitionsExec + UnionExec + SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet + + Optimized Plan: + OutputRequirementExec: order_by=[(nullable_col@0, asc)], 
dist_by=SinglePartition + SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet + "); + Ok(()) +} + +async fn union_with_mix_of_presorted_and_explicitly_resorted_inputs_impl( + repartition_sorts: bool, +) -> Result { + let schema = create_test_schema()?; + + // Source 1, will be sorted explicitly (on `nullable_col`) + let source1 = parquet_exec(schema.clone()); + let ordering1 = [sort_expr("nullable_col", &schema)].into(); + let sort1 = sort_exec(ordering1, source1.clone()); + + // Source 2, pre-sorted (on `nullable_col`) + let parquet_ordering: LexOrdering = [sort_expr("nullable_col", &schema)].into(); + let source2 = parquet_exec_with_sort(schema.clone(), vec![parquet_ordering.clone()]); + + let union = union_exec(vec![sort1, source2]); + + let coalesced = coalesce_partitions_exec(union); + + // Required sorted / single partitioned output + let requirement = [PhysicalSortRequirement::new( + col("nullable_col", &schema)?, + Some(SortOptions::new(false, true)), + )] + .into(); + let physical_plan = Arc::new(OutputRequirementExec::new( + coalesced, + Some(OrderingRequirements::new(requirement)), + Distribution::SinglePartition, + None, + )); + + let test = + EnforceSortingTest::new(physical_plan).with_repartition_sorts(repartition_sorts); + Ok(test.run()) +} + #[tokio::test] async fn test_union_inputs_different_sorted3() -> Result<()> { let schema = create_test_schema()?; @@ -667,12 +760,12 @@ async fn test_soft_hard_requirements_remove_soft_requirement() -> Result<()> { let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true); assert_snapshot!(test.run(), @r#" Input Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet Optimized Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet "#); @@ -716,13 +809,13 @@ async fn test_soft_hard_requirements_remove_soft_requirement_without_pushdowns( assert_snapshot!(test.run(), @r#" Input Plan: ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as count] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN 
UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet Optimized Plan: ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as count] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet "#); @@ -763,13 +856,13 @@ async fn test_soft_hard_requirements_remove_soft_requirement_without_pushdowns( let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true); assert_snapshot!(test.run(), @r#" Input Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet Optimized Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] @@ -824,15 +917,15 @@ async fn test_soft_hard_requirements_multiple_soft_requirements() -> Result<()> let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true); assert_snapshot!(test.run(), @r#" Input Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet Optimized Plan: - 
BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] @@ -889,17 +982,17 @@ async fn test_soft_hard_requirements_multiple_soft_requirements() -> Result<()> let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true); assert_snapshot!(test.run(), @r#" Input Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet Optimized Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] @@ -961,14 +1054,14 @@ async fn test_soft_hard_requirements_multiple_sorts() -> Result<()> { Input Plan: SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: 
Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet Optimized Plan: SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] @@ -1023,16 +1116,16 @@ async fn test_soft_hard_requirements_with_multiple_soft_requirements_and_output_ assert_snapshot!(test.run(), @r#" Input Plan: OutputRequirementExec: order_by=[(non_nullable_col@1, asc)], dist_by=SinglePartition - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet Optimized Plan: OutputRequirementExec: order_by=[(non_nullable_col@1, asc)], dist_by=SinglePartition - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet "#); @@ -1081,7 +1174,7 @@ async fn test_window_multi_path_sort() -> Result<()> { 
let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true); assert_snapshot!(test.run(), @r#" Input Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortPreservingMergeExec: [nullable_col@0 DESC NULLS LAST] UnionExec SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] @@ -1090,7 +1183,7 @@ async fn test_window_multi_path_sort() -> Result<()> { DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet Optimized Plan: - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] SortPreservingMergeExec: [nullable_col@0 ASC] UnionExec DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], file_type=parquet @@ -1122,7 +1215,7 @@ async fn test_window_multi_path_sort2() -> Result<()> { let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true); assert_snapshot!(test.run(), @r#" Input Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC] UnionExec SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false] @@ -1131,7 +1224,7 @@ async fn test_window_multi_path_sort2() -> Result<()> { DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet Optimized Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortPreservingMergeExec: [nullable_col@0 ASC] UnionExec DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet @@ -1678,7 +1771,7 @@ async fn test_window_multi_layer_requirement() -> Result<()> { EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(true); assert_snapshot!(test.run(), @r#" Input Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": 
Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortPreservingMergeExec: [a@0 ASC, b@1 ASC] RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC, b@1 ASC RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 @@ -1686,7 +1779,7 @@ async fn test_window_multi_layer_requirement() -> Result<()> { DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false Optimized Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortPreservingMergeExec: [a@0 ASC, b@1 ASC] SortExec: expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[true] RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10 @@ -1783,18 +1876,18 @@ async fn test_remove_unnecessary_sort_window_multilayer() -> Result<()> { EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(true); assert_snapshot!(test.run(), @r#" Input Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] FilterExec: NOT non_nullable_col@1 SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] CoalesceBatchesExec: target_batch_size=128 SortExec: expr=[non_nullable_col@1 DESC], preserve_partitioning=[false] DataSourceExec: partitions=1, partition_sizes=[0] Optimized Plan: - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] FilterExec: NOT non_nullable_col@1 - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] CoalesceBatchesExec: target_batch_size=128 SortExec: expr=[non_nullable_col@1 DESC], preserve_partitioning=[false] DataSourceExec: partitions=1, partition_sizes=[0] @@ -2238,17 +2331,17 @@ async fn test_multiple_sort_window_exec() -> Result<()> { EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(true); assert_snapshot!(test.run(), @r#" Input Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", 
data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false] DataSourceExec: partitions=1, partition_sizes=[0] Optimized Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false] DataSourceExec: partitions=1, partition_sizes=[0] "#); @@ -2273,7 +2366,7 @@ async fn test_commutativity() -> Result<()> { assert_snapshot!(displayable(orig_plan.as_ref()).indent(true), @r#" SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false] RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: partitions=1, partition_sizes=[0] "#); @@ -2483,7 +2576,6 @@ async fn test_not_replaced_with_partial_sort_for_unbounded_input() -> Result<()> Ok(()) } -// Test that verifies that an orthogonal sort (a sort on columns not in the input ordering) #[test] fn test_removes_unused_orthogonal_sort() -> Result<()> { let schema = create_test_schema3()?; diff --git a/datafusion/core/tests/physical_optimizer/enforce_sorting_monotonicity.rs 
b/datafusion/core/tests/physical_optimizer/enforce_sorting_monotonicity.rs index 7d6c0484b624..ef233e222912 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_sorting_monotonicity.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_sorting_monotonicity.rs @@ -229,11 +229,11 @@ fn test_window_partial_constant_and_set_monotonicity_0() { @ r#" Input Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet Optimized Plan: - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -253,11 +253,11 @@ fn test_window_partial_constant_and_set_monotonicity_1() { @ r#" Input Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet Optimized Plan: - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -275,15 +275,15 @@ fn test_window_partial_constant_and_set_monotonicity_2() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, 
nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -299,15 +299,15 @@ fn test_window_partial_constant_and_set_monotonicity_3() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[avg@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[avg@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 
group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -323,16 +323,16 @@ fn test_window_partial_constant_and_set_monotonicity_4() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[non_nullable_col@1 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[non_nullable_col@1 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -348,16 +348,16 @@ fn test_window_partial_constant_and_set_monotonicity_5() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[non_nullable_col@1 DESC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[non_nullable_col@1 DESC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -SortExec: expr=[non_nullable_col@1 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[max: 
Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + SortExec: expr=[non_nullable_col@1 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -373,16 +373,16 @@ fn test_window_partial_constant_and_set_monotonicity_6() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[min@2 ASC NULLS LAST, non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[min@2 ASC NULLS LAST, non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -398,16 +398,16 @@ fn test_window_partial_constant_and_set_monotonicity_7() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[avg@2 DESC NULLS LAST, nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: 
file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[avg@2 DESC NULLS LAST, nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -427,15 +427,15 @@ fn test_window_partial_constant_and_set_monotonicity_8() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], 
output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -451,15 +451,15 @@ fn test_window_partial_constant_and_set_monotonicity_9() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -477,7 +477,7 @@ fn test_window_partial_constant_and_set_monotonicity_10() { @ r#" Input / Optimized Plan: SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -497,7 +497,7 @@ fn test_window_partial_constant_and_set_monotonicity_11() { @ r#" Input / Optimized Plan: SortExec: expr=[avg@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: 
Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -522,7 +522,7 @@ fn test_window_partial_constant_and_set_monotonicity_12() { @ r#" Input / Optimized Plan: SortExec: expr=[non_nullable_col@1 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -543,7 +543,7 @@ fn test_window_partial_constant_and_set_monotonicity_13() { @ r#" Input / Optimized Plan: SortExec: expr=[non_nullable_col@1 ASC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -564,7 +564,7 @@ fn test_window_partial_constant_and_set_monotonicity_14() { @ r#" Input / Optimized Plan: SortExec: expr=[min@2 DESC NULLS LAST, non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -585,7 +585,7 @@ fn test_window_partial_constant_and_set_monotonicity_15() { @ r#" Input / Optimized Plan: SortExec: expr=[avg@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 
group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -608,15 +608,15 @@ fn test_window_partial_constant_and_set_monotonicity_16() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -633,15 +633,15 @@ fn test_window_partial_constant_and_set_monotonicity_17() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[max@2 DESC, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[max@2 DESC, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], 
file_type=parquet -"# + Optimized Plan: + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -658,15 +658,15 @@ fn test_window_partial_constant_and_set_monotonicity_18() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[min@2 ASC, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[min@2 ASC, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -685,7 +685,7 @@ fn test_window_partial_constant_and_set_monotonicity_19() { @ r#" Input / Optimized Plan: SortExec: expr=[avg@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -710,7 +710,7 @@ fn test_window_partial_constant_and_set_monotonicity_20() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, 
start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -729,15 +729,15 @@ fn test_window_partial_constant_and_set_monotonicity_21() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC], preserve_partitioning=[false] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -756,7 +756,7 @@ fn test_window_partial_constant_and_set_monotonicity_22() { @ r#" Input / Optimized Plan: SortExec: expr=[min@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -777,7 +777,7 @@ fn test_window_partial_constant_and_set_monotonicity_23() { @ r#" Input / Optimized Plan: SortExec: expr=[avg@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: 
Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -800,15 +800,15 @@ fn test_window_partial_constant_and_set_monotonicity_24() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -827,7 +827,7 @@ fn test_window_partial_constant_and_set_monotonicity_25() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -847,7 +847,7 @@ fn test_window_partial_constant_and_set_monotonicity_26() { @ r#" Input / Optimized Plan: SortExec: expr=[min@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: 
Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "#); } @@ -867,7 +867,7 @@ fn test_window_partial_constant_and_set_monotonicity_27() { @ r#" Input / Optimized Plan: SortExec: expr=[avg@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "#); } @@ -893,7 +893,7 @@ fn test_window_partial_constant_and_set_monotonicity_28() { @ r#" Input / Optimized Plan: SortExec: expr=[count@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -912,15 +912,15 @@ fn test_window_partial_constant_and_set_monotonicity_29() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC], preserve_partitioning=[false] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], 
output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"#) + Optimized Plan: + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "#) } // Case 30: @@ -937,7 +937,7 @@ fn test_window_partial_constant_and_set_monotonicity_30() { @ r#" Input / Optimized Plan: SortExec: expr=[min@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "#); } @@ -957,7 +957,7 @@ fn test_window_partial_constant_and_set_monotonicity_31() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, avg@2 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -981,15 +981,15 @@ fn test_window_partial_constant_and_set_monotonicity_32() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: 
wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -1008,7 +1008,7 @@ fn test_window_partial_constant_and_set_monotonicity_33() { @ r#" Input / Optimized Plan: SortExec: expr=[max@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1027,15 +1027,15 @@ fn test_window_partial_constant_and_set_monotonicity_34() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } // Case 35: @@ -1053,7 +1053,7 @@ fn test_window_partial_constant_and_set_monotonicity_35() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, avg@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1077,15 +1077,15 @@ fn 
test_window_partial_constant_and_set_monotonicity_36() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -1102,15 +1102,15 @@ fn test_window_partial_constant_and_set_monotonicity_37() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[max@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[max@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -1129,7 +1129,7 @@ fn test_window_partial_constant_and_set_monotonicity_38() { @ r#" Input / Optimized Plan: SortExec: expr=[min@2 DESC, 
nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1149,7 +1149,7 @@ fn test_window_partial_constant_and_set_monotonicity_39() { @ r#" Input / Optimized Plan: SortExec: expr=[avg@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1173,15 +1173,15 @@ fn test_window_partial_constant_and_set_monotonicity_40() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -1200,7 +1200,7 @@ fn test_window_partial_constant_and_set_monotonicity_41() { @ r#" Input / Optimized Plan: SortExec: expr=[max@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT 
ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1221,7 +1221,7 @@ fn test_window_partial_constant_and_set_monotonicity_42() { @ r#" Input / Optimized Plan: SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1242,7 +1242,7 @@ fn test_window_partial_constant_and_set_monotonicity_43() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, avg@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1267,7 +1267,7 @@ fn test_window_partial_constant_and_set_monotonicity_44() { @ r#" Input / Optimized Plan: SortExec: expr=[count@2 ASC], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1288,7 +1288,7 @@ fn test_window_partial_constant_and_set_monotonicity_45() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1307,15 +1307,15 @@ fn test_window_partial_constant_and_set_monotonicity_46() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, min@2 DESC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, 
projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, min@2 DESC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -1331,15 +1331,15 @@ fn test_window_partial_constant_and_set_monotonicity_47() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -1361,15 +1361,15 @@ fn test_window_partial_constant_and_set_monotonicity_48() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[count@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: 
expr=[count@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -1387,7 +1387,7 @@ fn test_window_partial_constant_and_set_monotonicity_49() { @ r#" Input / Optimized Plan: SortExec: expr=[max@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] + BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1406,15 +1406,15 @@ fn test_window_partial_constant_and_set_monotonicity_50() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, min@2 DESC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, min@2 DESC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -1432,7 +1432,7 @@ fn test_window_partial_constant_and_set_monotonicity_51() { @ r#" Input / 
Optimized Plan: SortExec: expr=[avg@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1458,7 +1458,7 @@ fn test_window_partial_constant_and_set_monotonicity_52() { @ r#" Input / Optimized Plan: SortExec: expr=[count@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1479,7 +1479,7 @@ fn test_window_partial_constant_and_set_monotonicity_53() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1499,7 +1499,7 @@ fn test_window_partial_constant_and_set_monotonicity_54() { @ r#" Input / Optimized Plan: SortExec: expr=[min@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1517,15 +1517,15 @@ fn test_window_partial_constant_and_set_monotonicity_55() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, 
projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -1547,15 +1547,15 @@ fn test_window_partial_constant_and_set_monotonicity_56() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[count@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[count@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -1574,7 +1574,7 @@ fn test_window_partial_constant_and_set_monotonicity_57() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] + BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1595,7 +1595,7 @@ fn test_window_partial_constant_and_set_monotonicity_58() { @ r#" Input / Optimized Plan: SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, 
dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1615,7 +1615,7 @@ fn test_window_partial_constant_and_set_monotonicity_59() { @ r#" Input / Optimized Plan: SortExec: expr=[avg@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1641,7 +1641,7 @@ fn test_window_partial_constant_and_set_monotonicity_60() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1662,7 +1662,7 @@ fn test_window_partial_constant_and_set_monotonicity_61() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 ASC], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1683,7 +1683,7 @@ fn test_window_partial_constant_and_set_monotonicity_62() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, min@2 DESC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1701,15 +1701,15 @@ fn test_window_partial_constant_and_set_monotonicity_63() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 
frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } // =============================================REGION ENDS============================================= diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs index b91c1732260c..31909415a286 100644 --- a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs +++ b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs @@ -543,11 +543,11 @@ fn test_push_down_through_transparent_nodes() { } #[test] -fn test_no_pushdown_through_aggregates() { - // There are 2 important points here: - // 1. The outer filter **is not** pushed down at all because we haven't implemented pushdown support - // yet for AggregateExec. - // 2. The inner filter **is** pushed down into the DataSource. +fn test_pushdown_through_aggregates_on_grouping_columns() { + // Test that filters on grouping columns can be pushed through AggregateExec. + // This test has two filters: + // 1. An inner filter (a@0 = foo) below the aggregate - gets pushed to DataSource + // 2. 
An outer filter (b@1 = bar) above the aggregate - also gets pushed through because 'b' is a grouping column let scan = TestScanBuilder::new(schema()).with_support(true).build(); let coalesce = Arc::new(CoalesceBatchesExec::new(scan, 10)); @@ -586,7 +586,7 @@ fn test_no_pushdown_through_aggregates() { let predicate = col_lit_predicate("b", "bar", &schema()); let plan = Arc::new(FilterExec::try_new(predicate, coalesce).unwrap()); - // expect the predicate to be pushed down into the DataSource + // Both filters should be pushed down to the DataSource since both reference grouping columns insta::assert_snapshot!( OptimizationTest::new(plan, FilterPushdown::new(), true), @r" @@ -600,11 +600,10 @@ fn test_no_pushdown_through_aggregates() { - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true output: Ok: - - FilterExec: b@1 = bar - - CoalesceBatchesExec: target_batch_size=100 - - AggregateExec: mode=Final, gby=[a@0 as a, b@1 as b], aggr=[cnt], ordering_mode=PartiallySorted([0]) - - CoalesceBatchesExec: target_batch_size=10 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo + - CoalesceBatchesExec: target_batch_size=100 + - AggregateExec: mode=Final, gby=[a@0 as a, b@1 as b], aggr=[cnt], ordering_mode=Sorted + - CoalesceBatchesExec: target_batch_size=10 + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo AND b@1 = bar " ); } @@ -860,20 +859,17 @@ async fn test_topk_filter_passes_through_coalesce_partitions() { ]; // Create a source that supports all batches - let source = Arc::new(TestSource::new(true, batches)); - - let base_config = FileScanConfigBuilder::new( - ObjectStoreUrl::parse("test://").unwrap(), - Arc::clone(&schema()), - source, - ) - .with_file_groups(vec![ - // Partition 0 - FileGroup::new(vec![PartitionedFile::new("test1.parquet", 123)]), - // Partition 1 - FileGroup::new(vec![PartitionedFile::new("test2.parquet", 123)]), - ]) - .build(); + let source = Arc::new(TestSource::new(schema(), true, batches)); + + let base_config = + FileScanConfigBuilder::new(ObjectStoreUrl::parse("test://").unwrap(), source) + .with_file_groups(vec![ + // Partition 0 + FileGroup::new(vec![PartitionedFile::new("test1.parquet", 123)]), + // Partition 1 + FileGroup::new(vec![PartitionedFile::new("test2.parquet", 123)]), + ]) + .build(); let scan = DataSourceExec::from_data_source(base_config); @@ -1892,3 +1888,445 @@ fn col_lit_predicate( Arc::new(Literal::new(scalar_value)), )) } + +#[tokio::test] +async fn test_aggregate_filter_pushdown() { + // Test that filters can pass through AggregateExec even with aggregate functions + // when the filter references grouping columns + // Simulates: SELECT a, COUNT(b) FROM table WHERE a = 'x' GROUP BY a + + let batches = + vec![ + record_batch!(("a", Utf8, ["x", "y"]), ("b", Utf8, ["foo", "bar"])).unwrap(), + ]; + + let scan = TestScanBuilder::new(schema()) + .with_support(true) + .with_batches(batches) + .build(); + + // Create an aggregate: GROUP BY a with COUNT(b) + let group_by = PhysicalGroupBy::new_single(vec![( + col("a", &schema()).unwrap(), + "a".to_string(), + )]); + + // Add COUNT aggregate + let count_expr = + AggregateExprBuilder::new(count_udaf(), vec![col("b", &schema()).unwrap()]) + .schema(schema()) + .alias("count") + .build() + .unwrap(); + + let aggregate = Arc::new( + AggregateExec::try_new( + 
AggregateMode::Partial, + group_by, + vec![count_expr.into()], // Has aggregate function + vec![None], // No filter on the aggregate function + Arc::clone(&scan), + schema(), + ) + .unwrap(), + ); + + // Add a filter on the grouping column 'a' + let predicate = col_lit_predicate("a", "x", &schema()); + let plan = Arc::new(FilterExec::try_new(predicate, aggregate).unwrap()) + as Arc; + + // Even with aggregate functions, filter on grouping column should be pushed through + insta::assert_snapshot!( + OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new(), true), + @r" + OptimizationTest: + input: + - FilterExec: a@0 = x + - AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + output: + Ok: + - AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count], ordering_mode=Sorted + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = x + " + ); +} + +#[tokio::test] +async fn test_no_pushdown_filter_on_aggregate_result() { + // Test that filters on aggregate results (not grouping columns) are NOT pushed through + // SELECT a, COUNT(b) as cnt FROM table GROUP BY a HAVING cnt > 5 + // The filter on 'cnt' cannot be pushed down because it's an aggregate result + + let batches = + vec![ + record_batch!(("a", Utf8, ["x", "y"]), ("b", Utf8, ["foo", "bar"])).unwrap(), + ]; + + let scan = TestScanBuilder::new(schema()) + .with_support(true) + .with_batches(batches) + .build(); + + // Create an aggregate: GROUP BY a with COUNT(b) + let group_by = PhysicalGroupBy::new_single(vec![( + col("a", &schema()).unwrap(), + "a".to_string(), + )]); + + // Add COUNT aggregate + let count_expr = + AggregateExprBuilder::new(count_udaf(), vec![col("b", &schema()).unwrap()]) + .schema(schema()) + .alias("count") + .build() + .unwrap(); + + let aggregate = Arc::new( + AggregateExec::try_new( + AggregateMode::Partial, + group_by, + vec![count_expr.into()], + vec![None], + Arc::clone(&scan), + schema(), + ) + .unwrap(), + ); + + // Add a filter on the aggregate output column + // This simulates filtering on COUNT result, which should NOT be pushed through + let agg_schema = aggregate.schema(); + let predicate = Arc::new(BinaryExpr::new( + Arc::new(Column::new_with_schema("count[count]", &agg_schema).unwrap()), + Operator::Gt, + Arc::new(Literal::new(ScalarValue::Int64(Some(5)))), + )); + let plan = Arc::new(FilterExec::try_new(predicate, aggregate).unwrap()) + as Arc; + + // The filter should NOT be pushed through the aggregate since it's on an aggregate result + insta::assert_snapshot!( + OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new(), true), + @r" + OptimizationTest: + input: + - FilterExec: count[count]@1 > 5 + - AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + output: + Ok: + - FilterExec: count[count]@1 > 5 + - AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + " + ); +} + +#[test] +fn test_pushdown_filter_on_non_first_grouping_column() { + // Test that filters on non-first grouping columns are still pushed down + // SELECT a, b, count(*) as cnt FROM table GROUP BY a, b HAVING b = 'bar' + // The filter is on 'b' (second grouping 
column), should push down + let scan = TestScanBuilder::new(schema()).with_support(true).build(); + + let aggregate_expr = + vec![ + AggregateExprBuilder::new(count_udaf(), vec![col("c", &schema()).unwrap()]) + .schema(schema()) + .alias("cnt") + .build() + .map(Arc::new) + .unwrap(), + ]; + + let group_by = PhysicalGroupBy::new_single(vec![ + (col("a", &schema()).unwrap(), "a".to_string()), + (col("b", &schema()).unwrap(), "b".to_string()), + ]); + + let aggregate = Arc::new( + AggregateExec::try_new( + AggregateMode::Final, + group_by, + aggregate_expr.clone(), + vec![None], + scan, + schema(), + ) + .unwrap(), + ); + + let predicate = col_lit_predicate("b", "bar", &schema()); + let plan = Arc::new(FilterExec::try_new(predicate, aggregate).unwrap()); + + insta::assert_snapshot!( + OptimizationTest::new(plan, FilterPushdown::new(), true), + @r" + OptimizationTest: + input: + - FilterExec: b@1 = bar + - AggregateExec: mode=Final, gby=[a@0 as a, b@1 as b], aggr=[cnt] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + output: + Ok: + - AggregateExec: mode=Final, gby=[a@0 as a, b@1 as b], aggr=[cnt], ordering_mode=PartiallySorted([1]) + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=b@1 = bar + " + ); +} + +#[test] +fn test_no_pushdown_grouping_sets_filter_on_missing_column() { + // Test that filters on columns missing from some grouping sets are NOT pushed through + let scan = TestScanBuilder::new(schema()).with_support(true).build(); + + let aggregate_expr = + vec![ + AggregateExprBuilder::new(count_udaf(), vec![col("c", &schema()).unwrap()]) + .schema(schema()) + .alias("cnt") + .build() + .map(Arc::new) + .unwrap(), + ]; + + // Create GROUPING SETS with (a, b) and (b) + let group_by = PhysicalGroupBy::new( + vec![ + (col("a", &schema()).unwrap(), "a".to_string()), + (col("b", &schema()).unwrap(), "b".to_string()), + ], + vec![ + ( + Arc::new(Literal::new(ScalarValue::Utf8(None))), + "a".to_string(), + ), + ( + Arc::new(Literal::new(ScalarValue::Utf8(None))), + "b".to_string(), + ), + ], + vec![ + vec![false, false], // (a, b) - both present + vec![true, false], // (b) - a is NULL, b present + ], + ); + + let aggregate = Arc::new( + AggregateExec::try_new( + AggregateMode::Final, + group_by, + aggregate_expr.clone(), + vec![None], + scan, + schema(), + ) + .unwrap(), + ); + + // Filter on column 'a' which is missing in the second grouping set, should not be pushed down + let predicate = col_lit_predicate("a", "foo", &schema()); + let plan = Arc::new(FilterExec::try_new(predicate, aggregate).unwrap()); + + insta::assert_snapshot!( + OptimizationTest::new(plan, FilterPushdown::new(), true), + @r" + OptimizationTest: + input: + - FilterExec: a@0 = foo + - AggregateExec: mode=Final, gby=[(a@0 as a, b@1 as b), (NULL as a, b@1 as b)], aggr=[cnt] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + output: + Ok: + - FilterExec: a@0 = foo + - AggregateExec: mode=Final, gby=[(a@0 as a, b@1 as b), (NULL as a, b@1 as b)], aggr=[cnt] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + " + ); +} + +#[test] +fn test_pushdown_grouping_sets_filter_on_common_column() { + // Test that filters on columns present in ALL grouping sets ARE pushed through + let scan = 
TestScanBuilder::new(schema()).with_support(true).build(); + + let aggregate_expr = + vec![ + AggregateExprBuilder::new(count_udaf(), vec![col("c", &schema()).unwrap()]) + .schema(schema()) + .alias("cnt") + .build() + .map(Arc::new) + .unwrap(), + ]; + + // Create GROUPING SETS with (a, b) and (b) + let group_by = PhysicalGroupBy::new( + vec![ + (col("a", &schema()).unwrap(), "a".to_string()), + (col("b", &schema()).unwrap(), "b".to_string()), + ], + vec![ + ( + Arc::new(Literal::new(ScalarValue::Utf8(None))), + "a".to_string(), + ), + ( + Arc::new(Literal::new(ScalarValue::Utf8(None))), + "b".to_string(), + ), + ], + vec![ + vec![false, false], // (a, b) - both present + vec![true, false], // (b) - a is NULL, b present + ], + ); + + let aggregate = Arc::new( + AggregateExec::try_new( + AggregateMode::Final, + group_by, + aggregate_expr.clone(), + vec![None], + scan, + schema(), + ) + .unwrap(), + ); + + // Filter on column 'b' which is present in all grouping sets will be pushed down + let predicate = col_lit_predicate("b", "bar", &schema()); + let plan = Arc::new(FilterExec::try_new(predicate, aggregate).unwrap()); + + insta::assert_snapshot!( + OptimizationTest::new(plan, FilterPushdown::new(), true), + @r" + OptimizationTest: + input: + - FilterExec: b@1 = bar + - AggregateExec: mode=Final, gby=[(a@0 as a, b@1 as b), (NULL as a, b@1 as b)], aggr=[cnt] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + output: + Ok: + - AggregateExec: mode=Final, gby=[(a@0 as a, b@1 as b), (NULL as a, b@1 as b)], aggr=[cnt], ordering_mode=PartiallySorted([1]) + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=b@1 = bar + " + ); +} + +#[test] +fn test_pushdown_with_empty_group_by() { + // Test that filters can be pushed down when GROUP BY is empty (no grouping columns) + // SELECT count(*) as cnt FROM table WHERE a = 'foo' + // There are no grouping columns, so the filter should still push down + let scan = TestScanBuilder::new(schema()).with_support(true).build(); + + let aggregate_expr = + vec![ + AggregateExprBuilder::new(count_udaf(), vec![col("c", &schema()).unwrap()]) + .schema(schema()) + .alias("cnt") + .build() + .map(Arc::new) + .unwrap(), + ]; + + // Empty GROUP BY - no grouping columns + let group_by = PhysicalGroupBy::new_single(vec![]); + + let aggregate = Arc::new( + AggregateExec::try_new( + AggregateMode::Final, + group_by, + aggregate_expr.clone(), + vec![None], + scan, + schema(), + ) + .unwrap(), + ); + + // Filter on 'a' + let predicate = col_lit_predicate("a", "foo", &schema()); + let plan = Arc::new(FilterExec::try_new(predicate, aggregate).unwrap()); + + // The filter should be pushed down even with empty GROUP BY + insta::assert_snapshot!( + OptimizationTest::new(plan, FilterPushdown::new(), true), + @r" + OptimizationTest: + input: + - FilterExec: a@0 = foo + - AggregateExec: mode=Final, gby=[], aggr=[cnt] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + output: + Ok: + - AggregateExec: mode=Final, gby=[], aggr=[cnt] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo + " + ); +} + +#[test] +fn test_pushdown_with_computed_grouping_key() { + // Test filter pushdown with computed grouping expression + // SELECT (c + 1.0) as c_plus_1, count(*) FROM table WHERE c 
> 5.0 GROUP BY (c + 1.0) + + let scan = TestScanBuilder::new(schema()).with_support(true).build(); + + let predicate = Arc::new(BinaryExpr::new( + col("c", &schema()).unwrap(), + Operator::Gt, + Arc::new(Literal::new(ScalarValue::Float64(Some(5.0)))), + )) as Arc; + let filter = Arc::new(FilterExec::try_new(predicate, scan).unwrap()); + + let aggregate_expr = + vec![ + AggregateExprBuilder::new(count_udaf(), vec![col("a", &schema()).unwrap()]) + .schema(schema()) + .alias("cnt") + .build() + .map(Arc::new) + .unwrap(), + ]; + + let c_plus_one = Arc::new(BinaryExpr::new( + col("c", &schema()).unwrap(), + Operator::Plus, + Arc::new(Literal::new(ScalarValue::Float64(Some(1.0)))), + )) as Arc; + + let group_by = + PhysicalGroupBy::new_single(vec![(c_plus_one, "c_plus_1".to_string())]); + + let plan = Arc::new( + AggregateExec::try_new( + AggregateMode::Final, + group_by, + aggregate_expr.clone(), + vec![None], + filter, + schema(), + ) + .unwrap(), + ); + + // The filter should be pushed down because 'c' is extracted from the grouping expression (c + 1.0) + insta::assert_snapshot!( + OptimizationTest::new(plan, FilterPushdown::new(), true), + @r" + OptimizationTest: + input: + - AggregateExec: mode=Final, gby=[c@2 + 1 as c_plus_1], aggr=[cnt] + - FilterExec: c@2 > 5 + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + output: + Ok: + - AggregateExec: mode=Final, gby=[c@2 + 1 as c_plus_1], aggr=[cnt] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=c@2 > 5 + " + ); +} diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs index f05f3f00281d..2bd70221f41e 100644 --- a/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs +++ b/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs @@ -52,7 +52,7 @@ use std::{ pub struct TestOpener { batches: Vec, batch_size: Option, - schema: Option, + schema: SchemaRef, projection: Option>, predicate: Option>, } @@ -70,23 +70,23 @@ impl FileOpener for TestOpener { } batches = new_batches.into_iter().collect(); } - if let Some(schema) = &self.schema { - let factory = DefaultSchemaAdapterFactory::from_schema(Arc::clone(schema)); - let (mapper, projection) = factory.map_schema(&batches[0].schema()).unwrap(); - let mut new_batches = Vec::new(); - for batch in batches { - let batch = if let Some(predicate) = &self.predicate { - batch_filter(&batch, predicate)? - } else { - batch - }; - let batch = batch.project(&projection).unwrap(); - let batch = mapper.map_batch(batch).unwrap(); - new_batches.push(batch); - } - batches = new_batches; + let factory = DefaultSchemaAdapterFactory::from_schema(Arc::clone(&self.schema)); + let (mapper, projection) = factory.map_schema(&batches[0].schema()).unwrap(); + let mut new_batches = Vec::new(); + for batch in batches { + let batch = if let Some(predicate) = &self.predicate { + batch_filter(&batch, predicate)? 
+ } else { + batch + }; + + let batch = batch.project(&projection).unwrap(); + let batch = mapper.map_batch(batch).unwrap(); + new_batches.push(batch); } + batches = new_batches; + if let Some(projection) = &self.projection { batches = batches .into_iter() @@ -101,26 +101,35 @@ impl FileOpener for TestOpener { } /// A placeholder data source that accepts filter pushdown -#[derive(Clone, Default)] +#[derive(Clone)] pub struct TestSource { support: bool, predicate: Option>, statistics: Option, batch_size: Option, batches: Vec, - schema: Option, + schema: SchemaRef, metrics: ExecutionPlanMetricsSet, projection: Option>, schema_adapter_factory: Option>, + table_schema: datafusion_datasource::TableSchema, } impl TestSource { - pub fn new(support: bool, batches: Vec) -> Self { + pub fn new(schema: SchemaRef, support: bool, batches: Vec) -> Self { + let table_schema = + datafusion_datasource::TableSchema::new(Arc::clone(&schema), vec![]); Self { + schema, support, metrics: ExecutionPlanMetricsSet::new(), batches, - ..Default::default() + predicate: None, + statistics: None, + batch_size: None, + projection: None, + schema_adapter_factory: None, + table_schema, } } } @@ -135,7 +144,7 @@ impl FileSource for TestSource { Arc::new(TestOpener { batches: self.batches.clone(), batch_size: self.batch_size, - schema: self.schema.clone(), + schema: Arc::clone(&self.schema), projection: self.projection.clone(), predicate: self.predicate.clone(), }) @@ -156,16 +165,9 @@ impl FileSource for TestSource { }) } - fn with_schema(&self, schema: SchemaRef) -> Arc { - Arc::new(TestSource { - schema: Some(schema), - ..self.clone() - }) - } - fn with_projection(&self, config: &FileScanConfig) -> Arc { Arc::new(TestSource { - projection: config.projection.clone(), + projection: config.projection_exprs.as_ref().map(|p| p.column_indices()), ..self.clone() }) } @@ -255,6 +257,10 @@ impl FileSource for TestSource { fn schema_adapter_factory(&self) -> Option> { self.schema_adapter_factory.clone() } + + fn table_schema(&self) -> &datafusion_datasource::TableSchema { + &self.table_schema + } } #[derive(Debug, Clone)] @@ -284,14 +290,15 @@ impl TestScanBuilder { } pub fn build(self) -> Arc { - let source = Arc::new(TestSource::new(self.support, self.batches)); - let base_config = FileScanConfigBuilder::new( - ObjectStoreUrl::parse("test://").unwrap(), + let source = Arc::new(TestSource::new( Arc::clone(&self.schema), - source, - ) - .with_file(PartitionedFile::new("test.parquet", 123)) - .build(); + self.support, + self.batches, + )); + let base_config = + FileScanConfigBuilder::new(ObjectStoreUrl::parse("test://").unwrap(), source) + .with_file(PartitionedFile::new("test.parquet", 123)) + .build(); DataSourceExec::from_data_source(base_config) } } diff --git a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs index c51a5e02c9c3..9d39a80fb9df 100644 --- a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs +++ b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs @@ -24,9 +24,10 @@ use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::memory::MemorySourceConfig; use datafusion::datasource::physical_plan::CsvSource; use datafusion::datasource::source::DataSourceExec; -use datafusion_common::config::ConfigOptions; +use datafusion_common::config::{ConfigOptions, CsvOptions}; use datafusion_common::{JoinSide, JoinType, NullEquality, Result, ScalarValue}; use 
datafusion_datasource::file_scan_config::FileScanConfigBuilder; +use datafusion_datasource::TableSchema; use datafusion_execution::object_store::ObjectStoreUrl; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_expr::{ @@ -384,14 +385,19 @@ fn create_simple_csv_exec() -> Arc { Field::new("d", DataType::Int32, true), Field::new("e", DataType::Int32, true), ])); - let config = FileScanConfigBuilder::new( - ObjectStoreUrl::parse("test:///").unwrap(), - schema, - Arc::new(CsvSource::new(false, 0, 0)), - ) - .with_file(PartitionedFile::new("x".to_string(), 100)) - .with_projection(Some(vec![0, 1, 2, 3, 4])) - .build(); + let config = + FileScanConfigBuilder::new(ObjectStoreUrl::parse("test:///").unwrap(), { + let options = CsvOptions { + has_header: Some(false), + delimiter: 0, + quote: 0, + ..Default::default() + }; + Arc::new(CsvSource::new(schema.clone()).with_csv_options(options)) + }) + .with_file(PartitionedFile::new("x".to_string(), 100)) + .with_projection_indices(Some(vec![0, 1, 2, 3, 4])) + .build(); DataSourceExec::from_data_source(config) } @@ -403,14 +409,19 @@ fn create_projecting_csv_exec() -> Arc { Field::new("c", DataType::Int32, true), Field::new("d", DataType::Int32, true), ])); - let config = FileScanConfigBuilder::new( - ObjectStoreUrl::parse("test:///").unwrap(), - schema, - Arc::new(CsvSource::new(false, 0, 0)), - ) - .with_file(PartitionedFile::new("x".to_string(), 100)) - .with_projection(Some(vec![3, 2, 1])) - .build(); + let config = + FileScanConfigBuilder::new(ObjectStoreUrl::parse("test:///").unwrap(), { + let options = CsvOptions { + has_header: Some(false), + delimiter: 0, + quote: 0, + ..Default::default() + }; + Arc::new(CsvSource::new(schema.clone()).with_csv_options(options)) + }) + .with_file(PartitionedFile::new("x".to_string(), 100)) + .with_projection_indices(Some(vec![3, 2, 1])) + .build(); DataSourceExec::from_data_source(config) } @@ -1589,14 +1600,22 @@ fn partitioned_data_source() -> Arc { Field::new("string_col", DataType::Utf8, true), ])); + let options = CsvOptions { + has_header: Some(false), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + let table_schema = TableSchema::new( + Arc::clone(&file_schema), + vec![Arc::new(Field::new("partition_col", DataType::Utf8, true))], + ); let config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), - file_schema.clone(), - Arc::new(CsvSource::default()), + Arc::new(CsvSource::new(table_schema).with_csv_options(options)), ) .with_file(PartitionedFile::new("x".to_string(), 100)) - .with_table_partition_cols(vec![Field::new("partition_col", DataType::Utf8, true)]) - .with_projection(Some(vec![0, 1, 2])) + .with_projection_indices(Some(vec![0, 1, 2])) .build(); DataSourceExec::from_data_source(config) diff --git a/datafusion/core/tests/physical_optimizer/sanity_checker.rs b/datafusion/core/tests/physical_optimizer/sanity_checker.rs index ce6eb13c86c4..9867ed173341 100644 --- a/datafusion/core/tests/physical_optimizer/sanity_checker.rs +++ b/datafusion/core/tests/physical_optimizer/sanity_checker.rs @@ -421,7 +421,7 @@ async fn test_bounded_window_agg_sort_requirement() -> Result<()> { assert_snapshot!( actual, @r#" - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 
CURRENT ROW], mode=[Sorted] SortExec: expr=[c9@0 ASC NULLS LAST], preserve_partitioning=[false] DataSourceExec: partitions=1, partition_sizes=[0] "# @@ -449,7 +449,7 @@ async fn test_bounded_window_agg_no_sort_requirement() -> Result<()> { assert_snapshot!( actual, @r#" - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: partitions=1, partition_sizes=[0] "# ); diff --git a/datafusion/core/tests/physical_optimizer/test_utils.rs b/datafusion/core/tests/physical_optimizer/test_utils.rs index 8ca33f3d4abb..60fec2243621 100644 --- a/datafusion/core/tests/physical_optimizer/test_utils.rs +++ b/datafusion/core/tests/physical_optimizer/test_utils.rs @@ -73,8 +73,7 @@ use datafusion_physical_plan::{ pub fn parquet_exec(schema: SchemaRef) -> Arc { let config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), - schema, - Arc::new(ParquetSource::default()), + Arc::new(ParquetSource::new(schema)), ) .with_file(PartitionedFile::new("x".to_string(), 100)) .build(); @@ -89,8 +88,7 @@ pub(crate) fn parquet_exec_with_sort( ) -> Arc { let config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), - schema, - Arc::new(ParquetSource::default()), + Arc::new(ParquetSource::new(schema)), ) .with_file(PartitionedFile::new("x".to_string(), 100)) .with_output_ordering(output_ordering) @@ -127,8 +125,7 @@ pub(crate) fn parquet_exec_with_stats(file_size: u64) -> Arc { let config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), - schema(), - Arc::new(ParquetSource::new(Default::default())), + Arc::new(ParquetSource::new(schema())), ) .with_file(PartitionedFile::new("x".to_string(), file_size)) .with_statistics(statistics) diff --git a/datafusion/core/tests/schema_adapter/schema_adapter_integration_tests.rs b/datafusion/core/tests/schema_adapter/schema_adapter_integration_tests.rs index c3c92a9028d6..191529816481 100644 --- a/datafusion/core/tests/schema_adapter/schema_adapter_integration_tests.rs +++ b/datafusion/core/tests/schema_adapter/schema_adapter_integration_tests.rs @@ -27,12 +27,14 @@ use datafusion::datasource::physical_plan::{ }; use datafusion::physical_plan::ExecutionPlan; use datafusion::prelude::SessionContext; +use datafusion_common::config::CsvOptions; use datafusion_common::ColumnStatistics; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_datasource::schema_adapter::{ SchemaAdapter, SchemaAdapterFactory, SchemaMapper, }; use datafusion_datasource::source::DataSourceExec; +use datafusion_datasource::TableSchema; use datafusion_execution::object_store::ObjectStoreUrl; use object_store::{memory::InMemory, path::Path, ObjectStore}; use parquet::arrow::ArrowWriter; @@ -182,17 +184,17 @@ async fn test_parquet_integration_with_schema_adapter() -> Result<()> { let ctx = SessionContext::new(); ctx.register_object_store(store_url.as_ref(), Arc::clone(&store)); - // Create a ParquetSource with the adapter factory - let file_source = ParquetSource::default() - .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}))?; - // Create a table schema with uppercase column names let table_schema = Arc::new(Schema::new(vec![ Field::new("ID", DataType::Int32, false), Field::new("NAME", 
DataType::Utf8, true), ])); - let config = FileScanConfigBuilder::new(store_url, table_schema.clone(), file_source) + // Create a ParquetSource with the adapter factory + let file_source = ParquetSource::new(table_schema.clone()) + .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}))?; + + let config = FileScanConfigBuilder::new(store_url, file_source) .with_file(PartitionedFile::new(path, file_size)) .build(); @@ -245,10 +247,10 @@ async fn test_parquet_integration_with_schema_adapter_and_expression_rewriter( ctx.register_object_store(store_url.as_ref(), Arc::clone(&store)); // Create a ParquetSource with the adapter factory - let file_source = ParquetSource::default() + let file_source = ParquetSource::new(batch.schema()) .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}))?; - let config = FileScanConfigBuilder::new(store_url, batch.schema(), file_source) + let config = FileScanConfigBuilder::new(store_url, file_source) .with_file(PartitionedFile::new(path, file_size)) .build(); @@ -282,9 +284,12 @@ async fn test_multi_source_schema_adapter_reuse() -> Result<()> { // Create a test factory let factory = Arc::new(UppercaseAdapterFactory {}); - // Test ArrowSource + // Test ArrowFileSource { - let source = ArrowSource::default(); + let schema = + Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + let table_schema = TableSchema::new(schema, vec![]); + let source = ArrowSource::new_file_source(table_schema); let source_with_adapter = source .clone() .with_schema_adapter_factory(factory.clone()) @@ -304,7 +309,9 @@ async fn test_multi_source_schema_adapter_reuse() -> Result<()> { // Test ParquetSource #[cfg(feature = "parquet")] { - let source = ParquetSource::default(); + let schema = + Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + let source = ParquetSource::new(schema); let source_with_adapter = source .clone() .with_schema_adapter_factory(factory.clone()) @@ -323,7 +330,15 @@ async fn test_multi_source_schema_adapter_reuse() -> Result<()> { // Test CsvSource { - let source = CsvSource::default(); + let schema = + Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + let options = CsvOptions { + has_header: Some(true), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + let source = CsvSource::new(schema).with_csv_options(options); let source_with_adapter = source .clone() .with_schema_adapter_factory(factory.clone()) @@ -342,7 +357,10 @@ async fn test_multi_source_schema_adapter_reuse() -> Result<()> { // Test JsonSource { - let source = JsonSource::default(); + let schema = + Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + let table_schema = TableSchema::new(schema, vec![]); + let source = JsonSource::new(table_schema); let source_with_adapter = source .clone() .with_schema_adapter_factory(factory.clone()) diff --git a/datafusion/core/tests/sql/explain_analyze.rs b/datafusion/core/tests/sql/explain_analyze.rs index 6d386cc456d8..929de7a5304d 100644 --- a/datafusion/core/tests/sql/explain_analyze.rs +++ b/datafusion/core/tests/sql/explain_analyze.rs @@ -61,37 +61,86 @@ async fn explain_analyze_baseline_metrics() { assert_metrics!( &formatted, "AggregateExec: mode=Partial, gby=[]", - "metrics=[output_rows=3, elapsed_compute=" + "metrics=[output_rows=3, elapsed_compute=", + "output_bytes=", + "output_batches=3" ); + assert_metrics!( &formatted, - "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1]", - "metrics=[output_rows=5, elapsed_compute=" + 
"AggregateExec: mode=Partial, gby=[c1@0 as c1]", + "reduction_factor=5.1% (5/99)" ); + + { + let expected_batch_count_after_repartition = + if cfg!(not(feature = "force_hash_collisions")) { + "output_batches=3" + } else { + "output_batches=1" + }; + + assert_metrics!( + &formatted, + "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1]", + "metrics=[output_rows=5, elapsed_compute=", + "output_bytes=", + expected_batch_count_after_repartition + ); + + assert_metrics!( + &formatted, + "RepartitionExec: partitioning=Hash([c1@0], 3), input_partitions=3", + "metrics=[output_rows=5, elapsed_compute=", + "output_bytes=", + expected_batch_count_after_repartition + ); + + assert_metrics!( + &formatted, + "ProjectionExec: expr=[]", + "metrics=[output_rows=5, elapsed_compute=", + "output_bytes=", + expected_batch_count_after_repartition + ); + + assert_metrics!( + &formatted, + "CoalesceBatchesExec: target_batch_size=4096", + "metrics=[output_rows=5, elapsed_compute", + "output_bytes=", + expected_batch_count_after_repartition + ); + } + assert_metrics!( &formatted, "FilterExec: c13@1 != C2GT5KVyOPZpgKVl110TyZO0NcJ434", - "metrics=[output_rows=99, elapsed_compute=" - ); - assert_metrics!( - &formatted, - "ProjectionExec: expr=[]", - "metrics=[output_rows=5, elapsed_compute=" + "metrics=[output_rows=99, elapsed_compute=", + "output_bytes=", + "output_batches=1" ); + assert_metrics!( &formatted, - "CoalesceBatchesExec: target_batch_size=4096", - "metrics=[output_rows=5, elapsed_compute" + "FilterExec: c13@1 != C2GT5KVyOPZpgKVl110TyZO0NcJ434", + "selectivity=99% (99/100)" ); + assert_metrics!( &formatted, "UnionExec", - "metrics=[output_rows=3, elapsed_compute=" + "metrics=[output_rows=3, elapsed_compute=", + "output_bytes=", + "output_batches=3" ); + assert_metrics!( &formatted, "WindowAggExec", - "metrics=[output_rows=1, elapsed_compute=" + "metrics=[output_rows=1, elapsed_compute=", + "output_bytes=", + "output_batches=1" ); fn expected_to_have_metrics(plan: &dyn ExecutionPlan) -> bool { @@ -193,9 +242,13 @@ async fn explain_analyze_level() { for (level, needle, should_contain) in [ (ExplainAnalyzeLevel::Summary, "spill_count", false), + (ExplainAnalyzeLevel::Summary, "output_batches", false), (ExplainAnalyzeLevel::Summary, "output_rows", true), + (ExplainAnalyzeLevel::Summary, "output_bytes", true), (ExplainAnalyzeLevel::Dev, "spill_count", true), (ExplainAnalyzeLevel::Dev, "output_rows", true), + (ExplainAnalyzeLevel::Dev, "output_bytes", true), + (ExplainAnalyzeLevel::Dev, "output_batches", true), ] { let plan = collect_plan(sql, level).await; assert_eq!( @@ -234,6 +287,37 @@ async fn explain_analyze_level_datasource_parquet() { } } +#[tokio::test] +async fn explain_analyze_parquet_pruning_metrics() { + let table_name = "tpch_lineitem_small"; + let parquet_path = "tests/data/tpch_lineitem_small.parquet"; + let ctx = SessionContext::new(); + ctx.register_parquet(table_name, parquet_path, ParquetReadOptions::default()) + .await + .expect("register parquet table for explain analyze test"); + + // Test scenario: + // This table's l_orderkey has range [1, 7] + // So the following query can't prune the file: + // select * from tpch_lineitem_small where l_orderkey = 5; + // If change filter to `l_orderkey=10`, the whole file can be pruned using stat. 
+ for (l_orderkey, expected_pruning_metrics) in + [(5, "1 total → 1 matched"), (10, "1 total → 0 matched")] + { + let sql = format!( + "explain analyze select * from {table_name} where l_orderkey = {l_orderkey};" + ); + + let plan = + collect_plan_with_context(&sql, &ctx, ExplainAnalyzeLevel::Summary).await; + + let expected_metrics = + format!("files_ranges_pruned_statistics={expected_pruning_metrics}"); + + assert_metrics!(&plan, "DataSourceExec", &expected_metrics); + } +} + #[tokio::test] async fn csv_explain_plans() { // This test verify the look of each plan in its full cycle plan creation @@ -732,14 +816,12 @@ async fn test_physical_plan_display_indent_multi_children() { CoalesceBatchesExec: target_batch_size=4096 HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c1@0, c2@0)], projection=[c1@0] CoalesceBatchesExec: target_batch_size=4096 - RepartitionExec: partitioning=Hash([c1@0], 9000), input_partitions=9000 - RepartitionExec: partitioning=RoundRobinBatch(9000), input_partitions=1 - DataSourceExec: file_groups={1 group: [[ARROW_TEST_DATA/csv/aggregate_test_100.csv]]}, projection=[c1], file_type=csv, has_header=true + RepartitionExec: partitioning=Hash([c1@0], 9000), input_partitions=1 + DataSourceExec: file_groups={1 group: [[ARROW_TEST_DATA/csv/aggregate_test_100.csv]]}, projection=[c1], file_type=csv, has_header=true CoalesceBatchesExec: target_batch_size=4096 - RepartitionExec: partitioning=Hash([c2@0], 9000), input_partitions=9000 - RepartitionExec: partitioning=RoundRobinBatch(9000), input_partitions=1 - ProjectionExec: expr=[c1@0 as c2] - DataSourceExec: file_groups={1 group: [[ARROW_TEST_DATA/csv/aggregate_test_100.csv]]}, projection=[c1], file_type=csv, has_header=true + RepartitionExec: partitioning=Hash([c2@0], 9000), input_partitions=1 + ProjectionExec: expr=[c1@0 as c2] + DataSourceExec: file_groups={1 group: [[ARROW_TEST_DATA/csv/aggregate_test_100.csv]]}, projection=[c1], file_type=csv, has_header=true "### ); } @@ -798,10 +880,29 @@ async fn parquet_explain_analyze() { // should contain aggregated stats assert_contains!(&formatted, "output_rows=8"); - assert_contains!(&formatted, "row_groups_matched_bloom_filter=0"); - assert_contains!(&formatted, "row_groups_pruned_bloom_filter=0"); - assert_contains!(&formatted, "row_groups_matched_statistics=1"); - assert_contains!(&formatted, "row_groups_pruned_statistics=0"); + assert_contains!( + &formatted, + "row_groups_pruned_bloom_filter=1 total \u{2192} 1 matched" + ); + assert_contains!( + &formatted, + "row_groups_pruned_statistics=1 total \u{2192} 1 matched" + ); + + // The order of metrics is expected to be the same as the actual pruning order + // (file -> row-group -> page) + let i_file = formatted.find("files_ranges_pruned_statistics").unwrap(); + let i_rowgroup_stat = formatted.find("row_groups_pruned_statistics").unwrap(); + let i_rowgroup_bloomfilter = + formatted.find("row_groups_pruned_bloom_filter").unwrap(); + let i_page = formatted.find("page_index_rows_pruned").unwrap(); + + assert!( + (i_file < i_rowgroup_stat) + && (i_rowgroup_stat < i_rowgroup_bloomfilter) + && (i_rowgroup_bloomfilter < i_page), + "The parquet pruning metrics should be displayed in the order: file range -> row group statistics -> row group bloom filter -> page index."
+ ); } // This test reproduces the behavior described in @@ -941,9 +1042,7 @@ async fn parquet_explain_analyze_verbose() { .to_string(); // should contain the raw per file stats (with the label) - assert_contains!(&formatted, "row_groups_matched_bloom_filter{partition=0"); assert_contains!(&formatted, "row_groups_pruned_bloom_filter{partition=0"); - assert_contains!(&formatted, "row_groups_matched_statistics{partition=0"); assert_contains!(&formatted, "row_groups_pruned_statistics{partition=0"); } @@ -1027,3 +1126,33 @@ async fn csv_explain_analyze_with_statistics() { ", statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]]" ); } + +#[tokio::test] +async fn nested_loop_join_selectivity() { + for (join_type, expected_selectivity) in [ + ("INNER", "1% (1/100)"), + ("LEFT", "10% (10/100)"), + ("RIGHT", "10% (10/100)"), + // 1 match + 9 left + 9 right = 19 + ("FULL", "19% (19/100)"), + ] { + let ctx = SessionContext::new(); + let sql = format!( + "EXPLAIN ANALYZE SELECT * \ + FROM generate_series(1, 10) as t1(a) \ + {join_type} JOIN generate_series(1, 10) as t2(b) \ + ON (t1.a + t2.b) = 20" + ); + + let actual = execute_to_batches(&ctx, sql.as_str()).await; + let formatted = arrow::util::pretty::pretty_format_batches(&actual) + .unwrap() + .to_string(); + + assert_metrics!( + &formatted, + "NestedLoopJoinExec", + &format!("selectivity={expected_selectivity}") + ); + } +} diff --git a/datafusion/core/tests/sql/joins.rs b/datafusion/core/tests/sql/joins.rs index 7a5983447592..b64a1e18f3f6 100644 --- a/datafusion/core/tests/sql/joins.rs +++ b/datafusion/core/tests/sql/joins.rs @@ -73,13 +73,11 @@ async fn join_change_in_planner() -> Result<()> { @r" SymmetricHashJoinExec: mode=Partitioned, join_type=Full, on=[(a2@1, a2@1)], filter=CAST(a1@0 AS Int64) > CAST(a1@1 AS Int64) + 3 AND CAST(a1@0 AS Int64) < CAST(a1@1 AS Int64) + 10 CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a1@0 ASC NULLS LAST - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 - StreamingTableExec: partition_sizes=1, projection=[a1, a2], infinite_source=true, output_ordering=[a1@0 ASC NULLS LAST] + RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=1 + StreamingTableExec: partition_sizes=1, projection=[a1, a2], infinite_source=true, output_ordering=[a1@0 ASC NULLS LAST] CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a1@0 ASC NULLS LAST - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 - StreamingTableExec: partition_sizes=1, projection=[a1, a2], infinite_source=true, output_ordering=[a1@0 ASC NULLS LAST] + RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=1 + StreamingTableExec: partition_sizes=1, projection=[a1, a2], infinite_source=true, output_ordering=[a1@0 ASC NULLS LAST] " ); Ok(()) @@ -134,13 +132,11 @@ async fn join_no_order_on_filter() -> Result<()> { @r" SymmetricHashJoinExec: mode=Partitioned, join_type=Full, on=[(a2@1, a2@1)], filter=CAST(a3@0 AS Int64) > CAST(a3@1 AS Int64) + 3 AND CAST(a3@0 AS Int64) < CAST(a3@1 AS Int64) + 10 CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 - StreamingTableExec: partition_sizes=1, projection=[a1, a2, a3], infinite_source=true, output_ordering=[a1@0 ASC NULLS LAST] + RepartitionExec: 
partitioning=Hash([a2@1], 8), input_partitions=1 + StreamingTableExec: partition_sizes=1, projection=[a1, a2, a3], infinite_source=true, output_ordering=[a1@0 ASC NULLS LAST] CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 - StreamingTableExec: partition_sizes=1, projection=[a1, a2, a3], infinite_source=true, output_ordering=[a1@0 ASC NULLS LAST] + RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=1 + StreamingTableExec: partition_sizes=1, projection=[a1, a2, a3], infinite_source=true, output_ordering=[a1@0 ASC NULLS LAST] " ); Ok(()) @@ -177,13 +173,11 @@ async fn join_change_in_planner_without_sort() -> Result<()> { @r" SymmetricHashJoinExec: mode=Partitioned, join_type=Full, on=[(a2@1, a2@1)], filter=CAST(a1@0 AS Int64) > CAST(a1@1 AS Int64) + 3 AND CAST(a1@0 AS Int64) < CAST(a1@1 AS Int64) + 10 CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 - StreamingTableExec: partition_sizes=1, projection=[a1, a2], infinite_source=true + RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=1 + StreamingTableExec: partition_sizes=1, projection=[a1, a2], infinite_source=true CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 - StreamingTableExec: partition_sizes=1, projection=[a1, a2], infinite_source=true + RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=1 + StreamingTableExec: partition_sizes=1, projection=[a1, a2], infinite_source=true " ); Ok(()) diff --git a/datafusion/core/tests/sql/mod.rs b/datafusion/core/tests/sql/mod.rs index e212ee269b15..426ec213b324 100644 --- a/datafusion/core/tests/sql/mod.rs +++ b/datafusion/core/tests/sql/mod.rs @@ -40,19 +40,24 @@ use std::io::Write; use std::path::PathBuf; use tempfile::TempDir; -/// A macro to assert that some particular line contains two substrings -/// -/// Usage: `assert_metrics!(actual, operator_name, metrics)` +/// A macro to assert that some particular line contains the given substrings /// +/// Usage: `assert_metrics!(actual, operator_name, metrics_1, metrics_2, ...)` macro_rules! 
diff --git a/datafusion/core/tests/sql/mod.rs b/datafusion/core/tests/sql/mod.rs
index e212ee269b15..426ec213b324 100644
--- a/datafusion/core/tests/sql/mod.rs
+++ b/datafusion/core/tests/sql/mod.rs
@@ -40,19 +40,24 @@ use std::io::Write;
 use std::path::PathBuf;
 use tempfile::TempDir;
 
-/// A macro to assert that some particular line contains two substrings
-///
-/// Usage: `assert_metrics!(actual, operator_name, metrics)`
+/// A macro to assert that some particular line contains the given substrings
 ///
+/// Usage: `assert_metrics!(actual, operator_name, metrics_1, metrics_2, ...)`
 macro_rules! assert_metrics {
-    ($ACTUAL: expr, $OPERATOR_NAME: expr, $METRICS: expr) => {
+    ($ACTUAL: expr, $OPERATOR_NAME: expr, $($METRICS: expr),+) => {
         let found = $ACTUAL
             .lines()
-            .any(|line| line.contains($OPERATOR_NAME) && line.contains($METRICS));
+            .any(|line| line.contains($OPERATOR_NAME) $( && line.contains($METRICS))+);
+
+        let mut metrics = String::new();
+        $(metrics.push_str(format!(" '{}',", $METRICS).as_str());)+
+        // remove the last `,` from the string
+        metrics.pop();
+
         assert!(
             found,
-            "Can not find a line with both '{}' and '{}' in\n\n{}",
-            $OPERATOR_NAME, $METRICS, $ACTUAL
+            "Cannot find a line with operator name '{}' and metrics containing values {} in:\n\n{}",
+            $OPERATOR_NAME, metrics, $ACTUAL
        );
    };
 }
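With the variadic form above, a call site can assert several metric substrings against the same operator line at once. A hedged usage sketch (the operator line and metric strings below are made up for illustration, not taken from a real plan):

#[test]
fn assert_metrics_variadic_usage_sketch() {
    // Illustrative only: a fake EXPLAIN ANALYZE line containing two metrics.
    let formatted =
        "NestedLoopJoinExec: ..., output_rows=19, selectivity=19% (19/100)".to_string();

    // Old single-substring form still works.
    assert_metrics!(&formatted, "NestedLoopJoinExec", "output_rows=19");

    // New variadic form: every listed substring must appear on the same operator line.
    assert_metrics!(
        &formatted,
        "NestedLoopJoinExec",
        "output_rows=19",
        "selectivity=19% (19/100)"
    );
}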
diff --git a/datafusion/core/tests/sql/select.rs b/datafusion/core/tests/sql/select.rs
index 2eb3ba36dd90..8a0f62062738 100644
--- a/datafusion/core/tests/sql/select.rs
+++ b/datafusion/core/tests/sql/select.rs
@@ -222,11 +222,11 @@ async fn test_parameter_invalid_types() -> Result<()> {
         .collect()
         .await;
     assert_snapshot!(results.unwrap_err().strip_backtrace(),
-    @r#"
-    type_coercion
-    caused by
-    Error during planning: Cannot infer common argument type for comparison operation List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) = Int32
-    "#);
+    @r"
+    type_coercion
+    caused by
+    Error during planning: Cannot infer common argument type for comparison operation List(nullable Int32) = Int32
+    ");
     Ok(())
 }
diff --git a/datafusion/core/tests/user_defined/user_defined_aggregates.rs b/datafusion/core/tests/user_defined/user_defined_aggregates.rs
index 982b4804597e..62e8ab18b9be 100644
--- a/datafusion/core/tests/user_defined/user_defined_aggregates.rs
+++ b/datafusion/core/tests/user_defined/user_defined_aggregates.rs
@@ -954,13 +954,7 @@ impl AggregateUDFImpl for MetadataBasedAggregateUdf {
     }
 
     fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
-        let input_expr = acc_args
-            .exprs
-            .first()
-            .ok_or(exec_datafusion_err!("Expected one argument"))?;
-        let input_field = input_expr.return_field(acc_args.schema)?;
-
-        let double_output = input_field
+        let double_output = acc_args.expr_fields[0]
             .metadata()
             .get("modify_values")
             .map(|v| v == "double_output")
diff --git a/datafusion/core/tests/user_defined/user_defined_plan.rs b/datafusion/core/tests/user_defined/user_defined_plan.rs
index f0bf15d3483b..ffe0ba021edb 100644
--- a/datafusion/core/tests/user_defined/user_defined_plan.rs
+++ b/datafusion/core/tests/user_defined/user_defined_plan.rs
@@ -56,7 +56,6 @@
 //!
 //! The same answer can be produced by simply keeping track of the top
 //! N elements, reducing the total amount of required buffer memory.
-//!
 
 use std::fmt::Debug;
 use std::hash::Hash;
diff --git a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs
index fb1371da6ceb..3ca8f846aa5e 100644
--- a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs
+++ b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs
@@ -1812,7 +1812,7 @@ async fn test_config_options_work_for_scalar_func() -> Result<()> {
     });
 
     let mut config = SessionConfig::new();
-    config.options_mut().execution.time_zone = "AEST".into();
+    config.options_mut().execution.time_zone = Some("AEST".into());
 
     let ctx = SessionContext::new_with_config(config);
diff --git a/datafusion/datasource-arrow/Cargo.toml b/datafusion/datasource-arrow/Cargo.toml
index b3d1e3f2accc..fbadc8708ca6 100644
--- a/datafusion/datasource-arrow/Cargo.toml
+++ b/datafusion/datasource-arrow/Cargo.toml
@@ -51,6 +51,9 @@ tokio = { workspace = true }
 
 [dev-dependencies]
 chrono = { workspace = true }
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
diff --git a/datafusion/datasource-arrow/src/file_format.rs b/datafusion/datasource-arrow/src/file_format.rs
index 3b8564080421..ef478e268890 100644
--- a/datafusion/datasource-arrow/src/file_format.rs
+++ b/datafusion/datasource-arrow/src/file_format.rs
@@ -20,15 +20,15 @@
 //! Works with files following the [Arrow IPC format](https://arrow.apache.org/docs/format/Columnar.html#ipc-file-format)
 
 use std::any::Any;
-use std::borrow::Cow;
 use std::collections::HashMap;
 use std::fmt::{self, Debug};
+use std::io::{Seek, SeekFrom};
 use std::sync::Arc;
 
 use arrow::datatypes::{Schema, SchemaRef};
 use arrow::error::ArrowError;
 use arrow::ipc::convert::fb_to_schema;
-use arrow::ipc::reader::FileReader;
+use arrow::ipc::reader::{FileReader, StreamReader};
 use arrow::ipc::writer::IpcWriteOptions;
 use arrow::ipc::{root_as_message, CompressionType};
 use datafusion_common::error::Result;
@@ -45,6 +45,7 @@ use datafusion_datasource::sink::{DataSink, DataSinkExec};
 use datafusion_datasource::write::{
     get_writer_schema, ObjectWriterBuilder, SharedBuffer,
 };
+use datafusion_datasource::TableSchema;
 use datafusion_execution::{SendableRecordBatchStream, TaskContext};
 use datafusion_expr::dml::InsertOp;
 use datafusion_physical_expr_common::sort_expr::LexRequirement;
@@ -61,7 +62,9 @@ use datafusion_physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan};
 use datafusion_session::Session;
 use futures::stream::BoxStream;
 use futures::StreamExt;
-use object_store::{GetResultPayload, ObjectMeta, ObjectStore};
+use object_store::{
+    path::Path, GetOptions, GetRange, GetResultPayload, ObjectMeta, ObjectStore,
+};
 use tokio::io::AsyncWriteExt;
 
 /// Initial writing buffer size. Note this is just a size hint for efficiency. It
@@ -71,8 +74,8 @@ const INITIAL_BUFFER_BYTES: usize = 1048576;
 /// If the buffered Arrow data exceeds this size, it is flushed to object store
 const BUFFER_FLUSH_BYTES: usize = 1024000;
 
+/// Factory struct used to create [`ArrowFormat`]
 #[derive(Default, Debug)]
-/// Factory struct used to create [ArrowFormat]
 pub struct ArrowFormatFactory;
 
 impl ArrowFormatFactory {
@@ -107,7 +110,7 @@ impl GetExt for ArrowFormatFactory {
     }
 }
 
-/// Arrow `FileFormat` implementation.
+/// Arrow [`FileFormat`] implementation.
 #[derive(Default, Debug)]
 pub struct ArrowFormat;
 
@@ -150,12 +153,23 @@ impl FileFormat for ArrowFormat {
             let schema = match r.payload {
                 #[cfg(not(target_arch = "wasm32"))]
                 GetResultPayload::File(mut file, _) => {
-                    let reader = FileReader::try_new(&mut file, None)?;
-                    reader.schema()
-                }
-                GetResultPayload::Stream(stream) => {
-                    infer_schema_from_file_stream(stream).await?
+                    match FileReader::try_new(&mut file, None) {
+                        Ok(reader) => reader.schema(),
+                        Err(file_error) => {
+                            // not in the file format, but FileReader read some bytes
+                            // while trying to parse the file and so we need to rewind
+                            // it to the beginning of the file
+                            file.seek(SeekFrom::Start(0))?;
+                            match StreamReader::try_new(&mut file, None) {
+                                Ok(reader) => reader.schema(),
+                                Err(stream_error) => {
+                                    return Err(internal_datafusion_err!("Failed to parse Arrow file as either file format or stream format. File format error: {file_error}. Stream format error: {stream_error}"));
+                                }
+                            }
+                        }
+                    }
                 }
+                GetResultPayload::Stream(stream) => infer_stream_schema(stream).await?,
             };
             schemas.push(schema.as_ref().clone());
         }
@@ -175,10 +189,33 @@ impl FileFormat for ArrowFormat {
     async fn create_physical_plan(
         &self,
-        _state: &dyn Session,
+        state: &dyn Session,
         conf: FileScanConfig,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        let source = Arc::new(ArrowSource::default());
+        let object_store = state.runtime_env().object_store(&conf.object_store_url)?;
+        let object_location = &conf
+            .file_groups
+            .first()
+            .ok_or_else(|| internal_datafusion_err!("No files found in file group"))?
+            .files()
+            .first()
+            .ok_or_else(|| internal_datafusion_err!("No files found in file group"))?
+            .object_meta
+            .location;
+
+        let table_schema = TableSchema::new(
+            Arc::clone(conf.file_schema()),
+            conf.table_partition_cols().clone(),
+        );
+
+        let source: Arc<dyn FileSource> =
+            match is_object_in_arrow_ipc_file_format(object_store, object_location).await
+            {
+                Ok(true) => Arc::new(ArrowSource::new_file_source(table_schema)),
+                Ok(false) => Arc::new(ArrowSource::new_stream_file_source(table_schema)),
+                Err(e) => Err(e)?,
+            };
+
         let config = FileScanConfigBuilder::from(conf)
             .with_source(source)
             .build();
@@ -202,12 +239,12 @@ impl FileFormat for ArrowFormat {
         Ok(Arc::new(DataSinkExec::new(input, sink, order_requirements)) as _)
     }
 
-    fn file_source(&self) -> Arc<dyn FileSource> {
-        Arc::new(ArrowSource::default())
+    fn file_source(&self, table_schema: TableSchema) -> Arc<dyn FileSource> {
+        Arc::new(ArrowSource::new_file_source(table_schema))
     }
 }
 
-/// Implements [`FileSink`] for writing to arrow_ipc files
+/// Implements [`FileSink`] for Arrow IPC files
 struct ArrowFileSink {
     config: FileSinkConfig,
 }
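The `is_object_in_arrow_ipc_file_format` helper referenced above is not shown in this hunk. Conceptually it only has to look at the first bytes of the object: the IPC file format begins with the `ARROW1` magic, while the stream format begins directly with an encapsulated message (see the format notes in the next hunk). A minimal sketch of that check, with illustrative names and without the object store plumbing (the real helper presumably does a small ranged read first):

// Sketch only: classify a header as IPC file format vs. stream format.
// The real helper presumably fetches these bytes with a ranged object store read
// before choosing between ArrowSource::new_file_source and new_stream_file_source.
const ARROW_MAGIC: [u8; 6] = *b"ARROW1";
const CONTINUATION_MARKER: [u8; 4] = [0xff; 4];

/// True if `header` starts with the "ARROW1" magic (file format); false otherwise
/// (a stream starts directly with an encapsulated message).
fn looks_like_ipc_file_format(header: &[u8]) -> bool {
    header.len() >= ARROW_MAGIC.len() && header[..ARROW_MAGIC.len()] == ARROW_MAGIC
}

fn main() {
    assert!(looks_like_ipc_file_format(b"ARROW1\0\0"));
    // A stream written by arrow >= 0.15 starts with the continuation marker instead.
    assert!(!looks_like_ipc_file_format(&CONTINUATION_MARKER));
}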
@@ -344,94 +381,160 @@ impl DataSink for ArrowFileSink {
     }
 }
 
+// Custom implementation of inferring schema. Should eventually be moved upstream to arrow-rs.
+// See
+
 const ARROW_MAGIC: [u8; 6] = [b'A', b'R', b'R', b'O', b'W', b'1'];
 const CONTINUATION_MARKER: [u8; 4] = [0xff; 4];
 
-/// Custom implementation of inferring schema. Should eventually be moved upstream to arrow-rs.
-/// See
-async fn infer_schema_from_file_stream(
+async fn infer_stream_schema(
     mut stream: BoxStream<'static, object_store::Result<Bytes>>,
 ) -> Result<SchemaRef> {
-    // Expected format:
-    // <magic number "ARROW1"> - 6 bytes
-    // <empty padding bytes [to 8 byte boundary]> - 2 bytes
-    // <continuation: 0xFFFFFFFF> - 4 bytes, not present below v0.15.0
-    // <metadata_size: int32> - 4 bytes
-    // <metadata_flatbuffer: bytes>
-    // <rest of file bytes>
-
-    // So in first read we need at least all known sized sections,
-    // which is 6 + 2 + 4 + 4 = 16 bytes.
-    let bytes = collect_at_least_n_bytes(&mut stream, 16, None).await?;
-
-    // Files should start with these magic bytes
-    if bytes[0..6] != ARROW_MAGIC {
-        return Err(ArrowError::ParseError(
-            "Arrow file does not contain correct header".to_string(),
-        ))?;
-    }
-
-    // Since continuation marker bytes added in later versions
-    let (meta_len, rest_of_bytes_start_index) = if bytes[8..12] == CONTINUATION_MARKER {
-        (&bytes[12..16], 16)
+    // IPC streaming format.
+    // See https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format
+    //
+    // <SCHEMA>
+    // <DICTIONARY 0>
+    // ...
+    // <DICTIONARY k - 1>
+    // <RECORD BATCH 0>
+    // ...
+    // <DICTIONARY x DELTA>
+    // ...
+    // <DICTIONARY y DELTA>
+    // ...
+    // <RECORD BATCH n - 1>
+    // <EOS [optional]: 0xFFFFFFFF 0x00000000>
+
+    // The streaming format is made up of a sequence of encapsulated messages.
+    // See https://arrow.apache.org/docs/format/Columnar.html#encapsulated-message-format
+    //
+    // <continuation: 0xFFFFFFFF> (added in v0.15.0)
+    // <metadata_size: int32>
+    // <metadata_flatbuffer: bytes>
+    // <padding>
+    // <message body>
+    //
+    // The first message is the schema.
+
+    // IPC file format is a wrapper around the streaming format with indexing information.
+    // See https://arrow.apache.org/docs/format/Columnar.html#ipc-file-format
+    //
+    // <magic number "ARROW1">
+    // <empty padding bytes [to 8 byte boundary]>
+    // <STREAMING FORMAT with EOS>