From b8549b943ee2a677d455e27a5def4ec01a7a47f8 Mon Sep 17 00:00:00 2001 From: Abanoub Doss Date: Sun, 24 May 2026 16:24:07 -0500 Subject: [PATCH 1/7] build(python): add bytes dep and wire file_io module into pyiceberg_core_rust Add `bytes = "1"` to the Python binding's Cargo.toml (needed for explicit byte-slice conversion in file_io.rs) and register file_io::register_module in lib.rs, placing it alongside the existing transform/manifest registrations. --- bindings/python/Cargo.lock | 955 +++++++++++++++++++++++++++++++++++-- bindings/python/Cargo.toml | 2 + bindings/python/src/lib.rs | 2 + 3 files changed, 927 insertions(+), 32 deletions(-) diff --git a/bindings/python/Cargo.lock b/bindings/python/Cargo.lock index 5a08355c0b..65140da8d5 100644 --- a/bindings/python/Cargo.lock +++ b/bindings/python/Cargo.lock @@ -72,7 +72,7 @@ version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0" dependencies = [ - "cfg-if", + "cfg-if 1.0.4", "cipher", "cpufeatures 0.2.17", ] @@ -97,7 +97,7 @@ version = "0.8.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" dependencies = [ - "cfg-if", + "cfg-if 1.0.4", "const-random", "getrandom 0.3.4", "once_cell", @@ -144,6 +144,56 @@ dependencies = [ "libc", ] +[[package]] +name = "anstream" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" + +[[package]] +name = "anstyle-parse" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.60.2", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.60.2", +] + [[package]] name = "anyhow" version = "1.0.102" @@ -175,6 +225,15 @@ dependencies = [ "zstd", ] +[[package]] +name = "approx" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cab112f0a86d568ea0e627cc1d6be74a1e9cd55214684db5561995f6dad897c6" +dependencies = [ + "num-traits", +] + [[package]] name = "ar_archive_writer" version = "0.5.1" @@ -612,7 +671,7 @@ dependencies = [ "arrayref", "arrayvec", "cc", - "cfg-if", + "cfg-if 1.0.4", "constant_time_eq", "cpufeatures 0.3.0", ] @@ -700,6 +759,17 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "bstr" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab" +dependencies = [ + "memchr", + "regex-automata", + "serde", +] + [[package]] name = "bumpalo" version = "3.20.2" @@ -754,6 +824,12 @@ dependencies = [ "shlex", ] +[[package]] +name = "cfg-if" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" + [[package]] name = "cfg-if" version = "1.0.4" @@ -766,6 +842,17 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" +[[package]] +name = "chacha20" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601" +dependencies = [ + "cfg-if 1.0.4", + "cpufeatures 0.3.0", + "rand_core 0.10.1", +] + [[package]] name = "chrono" version = "0.4.44" @@ -800,6 +887,46 @@ dependencies = [ "inout", ] +[[package]] +name = "clap" +version = "4.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "clap_lex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" + [[package]] name = "cmake" version = "0.1.58" @@ -809,6 +936,21 @@ dependencies = [ "cc", ] +[[package]] +name = "colorchoice" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" + +[[package]] +name = "colored" +version = "3.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "faf9468729b8cbcea668e36183cb69d317348c2e08e994829fb56ebfdfbaac34" +dependencies = [ + "windows-sys 0.52.0", +] + [[package]] name = "combine" version = "4.6.7" @@ -885,6 +1027,12 @@ dependencies = [ "tiny-keccak", ] +[[package]] +name = "const-str" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18f12cc9948ed9604230cdddc7c86e270f9401ccbe3c2e98a4378c5e7632212f" + [[package]] name = "const_panic" version = "0.2.15" @@ -900,6 +1048,16 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" +[[package]] +name = "core-foundation" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "core-foundation" version = "0.10.1" @@ -931,6 +1089,15 @@ version = "1.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "533d38ecd2709b7608fb8e18e4504deb99e9a72879e6aa66373a76d8dc4259ea" +[[package]] +name = "countio" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9702aee5d1d744c01d82f6915644f950f898e014903385464c773b96fefdecb" +dependencies = [ + "futures-io", +] + [[package]] name = "cpufeatures" version = "0.2.17" @@ -964,7 +1131,7 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" dependencies = [ - "cfg-if", + "cfg-if 1.0.4", ] [[package]] @@ -1129,7 +1296,7 @@ version = "6.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" dependencies = [ - "cfg-if", + "cfg-if 1.0.4", "crossbeam-utils", "hashbrown 0.14.5", "lock_api", @@ -1934,6 +2101,27 @@ dependencies = [ "subtle", ] +[[package]] +name = "dirs" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3e8aa94d75141228480295a7d0e7feb620b1a5ad9f12bc40be62411e38cce4e" +dependencies = [ + "dirs-sys", +] + +[[package]] +name = "dirs-sys" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e01a3366d27ee9890022452ee61b2b63a67e6f13f58900b651ff5665f0bb1fab" +dependencies = [ + "libc", + "option-ext", + "redox_users", + "windows-sys 0.60.2", +] + [[package]] name = "displaydoc" version = "0.2.5" @@ -2223,13 +2411,22 @@ dependencies = [ "slab", ] +[[package]] +name = "gearhash" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8cf82cf76cd16485e56295a1377c775ce708c9f1a0be6b029076d60a245d213" +dependencies = [ + "cfg-if 0.1.10", +] + [[package]] name = "generational-arena" version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "877e94aff08e743b651baaea359664321055749b398adff8740a7399af7796e7" dependencies = [ - "cfg-if", + "cfg-if 1.0.4", ] [[package]] @@ -2248,10 +2445,10 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" dependencies = [ - "cfg-if", + "cfg-if 1.0.4", "js-sys", "libc", - "wasi", + "wasi 0.11.1+wasi-snapshot-preview1", "wasm-bindgen", ] @@ -2261,7 +2458,7 @@ version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ - "cfg-if", + "cfg-if 1.0.4", "js-sys", "libc", "r-efi 5.3.0", @@ -2275,11 +2472,14 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" dependencies = [ - "cfg-if", + "cfg-if 1.0.4", + "js-sys", "libc", "r-efi 6.0.0", + "rand_core 0.10.1", "wasip2", "wasip3", + "wasm-bindgen", ] [[package]] @@ -2292,6 +2492,26 @@ dependencies = [ "polyval", ] +[[package]] +name = "git-version" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ad568aa3db0fcbc81f2f116137f263d7304f512a1209b35b85150d3ef88ad19" +dependencies = [ + "git-version-macro", +] + +[[package]] +name = "git-version-macro" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "glob" version = "0.3.3" @@ -2316,7 +2536,7 @@ version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" dependencies = [ - "cfg-if", + "cfg-if 1.0.4", "crunchy", "num-traits", "zerocopy", @@ -2360,6 +2580,12 @@ version = "0.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" +[[package]] +name = "heapify" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0049b265b7f201ca9ab25475b22b47fe444060126a51abe00f77d986fc5cc52e" + [[package]] name = "heck" version = "0.5.0" @@ -2372,6 +2598,28 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "hf-xet" +version = "1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "430b33fa84f92796d4d263070b6c0d3ca219df7b9a0e1853ee431029b1612bcd" +dependencies = [ + "async-trait", + "bytes", + "http", + "more-asserts", + "serde", + "thiserror", + "tokio", + "tokio-util", + "tracing", + "uuid", + "xet-client", + "xet-core-structures", + "xet-data", + "xet-runtime", +] + [[package]] name = "hmac" version = "0.12.1" @@ -2479,9 +2727,11 @@ dependencies = [ "percent-encoding", "pin-project-lite", "socket2", + "system-configuration", "tokio", "tower-service", "tracing", + "windows-registry", ] [[package]] @@ -2584,7 +2834,7 @@ dependencies = [ "anyhow", "async-trait", "bytes", - "cfg-if", + "cfg-if 1.0.4", "futures", "iceberg", "opendal", @@ -2764,6 +3014,12 @@ version = "2.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2" +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + [[package]] name = "itertools" version = "0.13.0" @@ -2837,7 +3093,7 @@ version = "0.22.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5efd9a482cf3a427f00d6b35f14332adc7902ce91efb778580e180ff90fa3498" dependencies = [ - "cfg-if", + "cfg-if 1.0.4", "combine", "jni-macros", "jni-sys", @@ -2896,7 +3152,7 @@ version = "0.3.98" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67df7112613f8bfd9150013a0314e196f4800d3201ae742489d999db2f979f08" dependencies = [ - "cfg-if", + "cfg-if 1.0.4", "futures-util", "once_cell", "wasm-bindgen", @@ -2920,6 +3176,23 @@ dependencies = [ "zeroize", ] +[[package]] +name = "konst" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f660d5f887e3562f9ab6f4a14988795b694099d66b4f5dedc02d197ba9becb1d" +dependencies = [ + "const_panic", + "konst_proc_macros", + "typewit", +] + +[[package]] +name = "konst_proc_macros" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" + [[package]] name = "lazy_static" version = "1.5.0" @@ -3010,7 +3283,7 @@ version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" dependencies = [ - "cfg-if", + "cfg-if 1.0.4", "winapi", ] @@ -3040,6 +3313,15 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" +[[package]] +name = "libredox" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e02f3bb43d335493c96bf3fd3a321600bf6bd07ed34bc64118e9293bdffea46c" +dependencies = [ + "libc", +] + [[package]] name = "linux-raw-sys" version = "0.12.1" @@ -3082,13 +3364,22 @@ dependencies = [ "twox-hash", ] +[[package]] +name = "matchers" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" +dependencies = [ + "regex-automata", +] + [[package]] name = "md-5" version = "0.10.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" dependencies = [ - "cfg-if", + "cfg-if 1.0.4", "digest", ] @@ -3124,7 +3415,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "50b7e5b27aa02a74bac8c3f23f448f8d87ff11f92d3aac1a6ed369ee08cc56c1" dependencies = [ "libc", - "wasi", + "wasi 0.11.1+wasi-snapshot-preview1", "windows-sys 0.61.2", ] @@ -3148,12 +3439,36 @@ dependencies = [ "uuid", ] +[[package]] +name = "more-asserts" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fafa6961cabd9c63bcd77a45d7e3b7f3b552b70417831fb0f56db717e72407e" + [[package]] name = "murmur3" version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9252111cf132ba0929b6f8e030cac2a24b507f3a4d6db6fb2896f27b354c714b" +[[package]] +name = "ntapi" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3b335231dfd352ffb0f8017f3b6027a4917f7df785ea2143d8af2adc66980ae" +dependencies = [ + "winapi", +] + +[[package]] +name = "nu-ansi-term" +version = "0.50.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" +dependencies = [ + "windows-sys 0.60.2", +] + [[package]] name = "num-bigint" version = "0.4.6" @@ -3226,6 +3541,34 @@ dependencies = [ "libm", ] +[[package]] +name = "objc2-core-foundation" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a180dd8642fa45cdb7dd721cd4c11b1cadd4929ce112ebd8b9f5803cc79d536" +dependencies = [ + "bitflags", +] + +[[package]] +name = "objc2-io-kit" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33fafba39597d6dc1fb709123dfa8289d39406734be322956a69f0931c73bb15" +dependencies = [ + "libc", + "objc2-core-foundation", +] + +[[package]] +name = "objc2-system-configuration" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7216bd11cbda54ccabcab84d523dc93b858ec75ecfb3a7d89513fa22464da396" +dependencies = [ + "objc2-core-foundation", +] + [[package]] name = "object" version = "0.37.3" @@ -3267,6 +3610,18 @@ version = "1.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + +[[package]] +name = "oneshot" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "269bca4c2591a28585d6bf10d9ed0332b7d76900a1b02bec41bdc3a2cdcda107" + [[package]] name = "opaque-debug" version = "0.3.1" @@ -3288,6 +3643,7 @@ dependencies = [ "opendal-service-azdls", "opendal-service-fs", "opendal-service-gcs", + "opendal-service-hf", "opendal-service-oss", "opendal-service-s3", ] @@ -3427,6 +3783,23 @@ dependencies = [ "tokio", ] +[[package]] +name = "opendal-service-hf" +version = "0.56.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b2ab7a2a8a11dfe257ef4db5c0de798acbcd0d6429c37382dad2154bc06a388" +dependencies = [ + "bytes", + "hf-xet", + "http", + "log", + "opendal-core", + "percent-encoding", + "reqwest 0.13.3", + "serde", + "serde_json", +] + [[package]] name = "opendal-service-oss" version = "0.56.0" @@ -3471,6 +3844,12 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" +[[package]] +name = "option-ext" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" + [[package]] name = "ordered-float" version = "2.10.1" @@ -3500,12 +3879,21 @@ dependencies = [ ] [[package]] -name = "parking" -version = "2.2.1" +name = "os_str_bytes" +version = "6.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba" - -[[package]] +checksum = "e2355d85b9a3786f481747ced0e0ff2ba35213a1f9bd406ed906554d7af805a1" +dependencies = [ + "memchr", +] + +[[package]] +name = "parking" +version = "2.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba" + +[[package]] name = "parking_lot" version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -3521,7 +3909,7 @@ version = "0.9.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" dependencies = [ - "cfg-if", + "cfg-if 1.0.4", "libc", "redox_syscall", "smallvec", @@ -3555,6 +3943,7 @@ dependencies = [ "num-traits", "object_store", "paste", + "ring", "seq-macro", "simdutf8", "snap", @@ -3635,6 +4024,26 @@ dependencies = [ "siphasher", ] +[[package]] +name = "pin-project" +version = "1.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2466b2336ed02bcdca6b294417127b90ec92038d1d5c4fbeac971a922e0e0924" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c96395f0a926bc13b1c17622aaddda1ecb55d49c8f1bf9777e4d877800a43f8b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "pin-project-lite" version = "0.2.17" @@ -3691,7 +4100,7 @@ version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d1fe60d06143b2430aa532c94cfe9e29783047f06c0d7fd359a9a51b729fa25" dependencies = [ - "cfg-if", + "cfg-if 1.0.4", "cpufeatures 0.2.17", "opaque-debug", "universal-hash", @@ -3793,11 +4202,13 @@ name = "pyiceberg_core_rust" version = "0.9.0" dependencies = [ "arrow", + "bytes", "datafusion-ffi", "iceberg", "iceberg-datafusion", "iceberg-storage-opendal", "pyo3", + "serde_json", "tokio", ] @@ -3982,6 +4393,17 @@ dependencies = [ "rand_core 0.9.5", ] +[[package]] +name = "rand" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207" +dependencies = [ + "chacha20", + "getrandom 0.4.2", + "rand_core 0.10.1", +] + [[package]] name = "rand_chacha" version = "0.3.1" @@ -4020,6 +4442,12 @@ dependencies = [ "getrandom 0.3.4", ] +[[package]] +name = "rand_core" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63b8176103e19a2643978565ca18b50549f6101881c443590420e4dc998a3c69" + [[package]] name = "recursive" version = "0.1.1" @@ -4040,6 +4468,15 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "redb" +version = "3.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ba239c1c1693315d3cc0e601db3b3965543afbf48c41730fdca2f069f510f4a" +dependencies = [ + "libc", +] + [[package]] name = "redox_syscall" version = "0.5.18" @@ -4049,6 +4486,17 @@ dependencies = [ "bitflags", ] +[[package]] +name = "redox_users" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4e608c6638b9c18977b00b475ac1f28d14e84b27d8d42f70e0bf1e3dec127ac" +dependencies = [ + "getrandom 0.2.17", + "libredox", + "thiserror", +] + [[package]] name = "ref-cast" version = "1.0.25" @@ -4282,6 +4730,8 @@ dependencies = [ "rustls", "rustls-pki-types", "rustls-platform-verifier", + "serde", + "serde_json", "sync_wrapper", "tokio", "tokio-rustls", @@ -4296,6 +4746,20 @@ dependencies = [ "web-sys", ] +[[package]] +name = "reqwest-middleware" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07bc3f1384cffa4f274dad2d4ddd73aed32fed8f786d96c6be8aa4e5fd3c3b58" +dependencies = [ + "anyhow", + "async-trait", + "http", + "reqwest 0.13.3", + "thiserror", + "tower-service", +] + [[package]] name = "ring" version = "0.17.14" @@ -4303,7 +4767,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" dependencies = [ "cc", - "cfg-if", + "cfg-if 1.0.4", "getrandom 0.2.17", "libc", "untrusted 0.9.0", @@ -4347,7 +4811,7 @@ version = "0.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "796e8d2b6696392a43bea58116b667fb4c29727dc5abd27d6acf338bb4f688c7" dependencies = [ - "cfg-if", + "cfg-if 1.0.4", "ordered-multimap", ] @@ -4421,7 +4885,7 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "26d1e2536ce4f35f4846aa13bff16bd0ff40157cdb14cc056c7b14ba41233ba0" dependencies = [ - "core-foundation", + "core-foundation 0.10.1", "core-foundation-sys", "jni", "log", @@ -4466,6 +4930,12 @@ version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" +[[package]] +name = "safe-transmute" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3944826ff8fa8093089aba3acb4ef44b9446a99a16f3bf4e74af3f77d340ab7d" + [[package]] name = "salsa20" version = "0.10.2" @@ -4541,7 +5011,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" dependencies = [ "bitflags", - "core-foundation", + "core-foundation 0.10.1", "core-foundation-sys", "libc", "security-framework-sys", @@ -4692,7 +5162,7 @@ version = "0.10.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" dependencies = [ - "cfg-if", + "cfg-if 1.0.4", "cpufeatures 0.2.17", "digest", ] @@ -4703,9 +5173,39 @@ version = "0.10.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" dependencies = [ - "cfg-if", + "cfg-if 1.0.4", "cpufeatures 0.2.17", "digest", + "sha2-asm", +] + +[[package]] +name = "sha2-asm" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b845214d6175804686b2bd482bcffe96651bb2d1200742b712003504a2dac1ab" +dependencies = [ + "cc", +] + +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "shellexpand" +version = "3.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32824fab5e16e6c4d86dc1ba84489390419a39f97699852b66480bb87d297ed8" +dependencies = [ + "bstr", + "dirs", + "os_str_bytes", ] [[package]] @@ -4843,12 +5343,28 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "640c8cdd92b6b12f5bcb1803ca3bbf5ab96e5e6b6b96b9ab77dabe9e880b3190" dependencies = [ "cc", - "cfg-if", + "cfg-if 1.0.4", "libc", "psm", "windows-sys 0.61.2", ] +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "statrs" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a3fe7c28c6512e766b0874335db33c94ad7b8f9054228ae1c2abd47ce7d335e" +dependencies = [ + "approx", + "num-traits", +] + [[package]] name = "strsim" version = "0.11.1" @@ -4882,6 +5398,12 @@ version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" +[[package]] +name = "symlink" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7973cce6668464ea31f176d85b13c7ab3bba2cb3b77a2ed26abd7801688010a" + [[package]] name = "syn" version = "1.0.109" @@ -4924,6 +5446,41 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "sysinfo" +version = "0.38.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92ab6a2f8bfe508deb3c6406578252e491d299cbbf3bc0529ecc3313aee4a52f" +dependencies = [ + "libc", + "memchr", + "ntapi", + "objc2-core-foundation", + "objc2-io-kit", + "windows", +] + +[[package]] +name = "system-configuration" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a13f3d0daba03132c0aa9767f98351b3488edc2c100cda2d2ec2b04f3d8d3c8b" +dependencies = [ + "bitflags", + "core-foundation 0.9.4", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "tagptr" version = "0.2.0" @@ -4969,6 +5526,15 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "thread_local" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185" +dependencies = [ + "cfg-if 1.0.4", +] + [[package]] name = "thrift" version = "0.17.0" @@ -5071,6 +5637,17 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "tokio-retry" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40f644c762e9d396831ae2f8935c954b0d758c4532e924bead0f666d0c1c8640" +dependencies = [ + "pin-project-lite", + "rand 0.10.1", + "tokio", +] + [[package]] name = "tokio-rustls" version = "0.26.4" @@ -5162,6 +5739,19 @@ dependencies = [ "tracing-core", ] +[[package]] +name = "tracing-appender" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "050686193eb999b4bb3bc2acfa891a13da00f79734704c4b8b4ef1a10b368a3c" +dependencies = [ + "crossbeam-channel", + "symlink", + "thiserror", + "time", + "tracing-subscriber", +] + [[package]] name = "tracing-attributes" version = "0.1.31" @@ -5180,6 +5770,49 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" dependencies = [ "once_cell", + "valuable", +] + +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-serde" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1" +dependencies = [ + "serde", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7f578e5945fb242538965c2d0b04418d38ec25c79d160cd279bf0731c8d319" +dependencies = [ + "matchers", + "nu-ansi-term", + "once_cell", + "regex-automata", + "serde", + "serde_json", + "sharded-slab", + "smallvec", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", + "tracing-serde", ] [[package]] @@ -5335,12 +5968,24 @@ dependencies = [ "serde", ] +[[package]] +name = "urlencoding" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" + [[package]] name = "utf8_iter" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + [[package]] name = "uuid" version = "1.23.1" @@ -5353,6 +5998,12 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "valuable" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" + [[package]] name = "version_check" version = "0.9.5" @@ -5384,6 +6035,15 @@ version = "0.11.1+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" +[[package]] +name = "wasi" +version = "0.14.7+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "883478de20367e224c0090af9cf5f9fa85bed63a95c1abf3afc5c083ebc06e8c" +dependencies = [ + "wasip2", +] + [[package]] name = "wasip2" version = "1.0.3+wasi-0.2.9" @@ -5402,13 +6062,22 @@ dependencies = [ "wit-bindgen 0.51.0", ] +[[package]] +name = "wasite" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66fe902b4a6b8028a753d5424909b764ccf79b7a209eac9bf97e59cda9f71a42" +dependencies = [ + "wasi 0.14.7+wasi-0.2.4", +] + [[package]] name = "wasm-bindgen" version = "0.2.121" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49ace1d07c165b0864824eee619580c4689389afa9dc9ed3a4c75040d82e6790" dependencies = [ - "cfg-if", + "cfg-if 1.0.4", "once_cell", "rustversion", "wasm-bindgen-macro", @@ -5533,6 +6202,19 @@ dependencies = [ "rustls-pki-types", ] +[[package]] +name = "whoami" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "998767ef88740d1f5b0682a9c53c24431453923962269c2db68ee43788c5a40d" +dependencies = [ + "libc", + "libredox", + "objc2-system-configuration", + "wasite", + "web-sys", +] + [[package]] name = "winapi" version = "0.3.9" @@ -5564,6 +6246,27 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "527fadee13e0c05939a6a05d5bd6eec6cd2e3dbd648b9f8e447c6518133d8580" +dependencies = [ + "windows-collections", + "windows-core", + "windows-future", + "windows-numerics", +] + +[[package]] +name = "windows-collections" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23b2d95af1a8a14a3c7367e1ed4fc9c20e0a26e79551b1454d72583c97cc6610" +dependencies = [ + "windows-core", +] + [[package]] name = "windows-core" version = "0.62.2" @@ -5577,6 +6280,17 @@ dependencies = [ "windows-strings", ] +[[package]] +name = "windows-future" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1d6f90251fe18a279739e78025bd6ddc52a7e22f921070ccdc67dde84c605cb" +dependencies = [ + "windows-core", + "windows-link", + "windows-threading", +] + [[package]] name = "windows-implement" version = "0.60.2" @@ -5605,6 +6319,27 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-numerics" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e2e40844ac143cdb44aead537bbf727de9b044e107a0f1220392177d15b0f26" +dependencies = [ + "windows-core", + "windows-link", +] + +[[package]] +name = "windows-registry" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720" +dependencies = [ + "windows-link", + "windows-result", + "windows-strings", +] + [[package]] name = "windows-result" version = "0.4.1" @@ -5683,6 +6418,15 @@ dependencies = [ "windows_x86_64_msvc 0.53.1", ] +[[package]] +name = "windows-threading" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3949bd5b99cafdf1c7ca86b43ca564028dfe27d66958f2470940f73d86d75b37" +dependencies = [ + "windows-link", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" @@ -5889,6 +6633,153 @@ dependencies = [ "rustix", ] +[[package]] +name = "xet-client" +version = "1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e1e496dcbe6a09017acdfaf48e1a646735e7ff5b2a49e2c7e081cca77a59bc8" +dependencies = [ + "anyhow", + "async-trait", + "base64", + "bytes", + "clap", + "crc32fast", + "futures", + "http", + "hyper", + "lazy_static", + "more-asserts", + "rand 0.10.1", + "redb", + "reqwest 0.13.3", + "reqwest-middleware", + "serde", + "serde_json", + "serde_repr", + "statrs", + "tempfile", + "thiserror", + "tokio", + "tokio-retry", + "tracing", + "tracing-subscriber", + "url", + "urlencoding", + "web-time", + "xet-core-structures", + "xet-runtime", +] + +[[package]] +name = "xet-core-structures" +version = "1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb838aa8eb67d730af301584cf003caad407487606058292a6750711b603fbee" +dependencies = [ + "async-trait", + "base64", + "blake3", + "bytemuck", + "bytes", + "clap", + "countio", + "csv", + "futures", + "futures-util", + "getrandom 0.4.2", + "heapify", + "itertools 0.14.0", + "lazy_static", + "lz4_flex", + "more-asserts", + "rand 0.10.1", + "regex", + "safe-transmute", + "serde", + "static_assertions", + "tempfile", + "thiserror", + "tokio", + "tokio-util", + "tracing", + "uuid", + "web-time", + "xet-runtime", +] + +[[package]] +name = "xet-data" +version = "1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67fd409bef621411a9d9013798540bb8036cb2678f03ab39af89a5e88034ed8c" +dependencies = [ + "anyhow", + "async-trait", + "bytes", + "chrono", + "clap", + "gearhash", + "http", + "itertools 0.14.0", + "lazy_static", + "more-asserts", + "rand 0.10.1", + "serde", + "serde_json", + "sha2", + "tempfile", + "thiserror", + "tokio", + "tokio-util", + "tracing", + "url", + "uuid", + "walkdir", + "xet-client", + "xet-core-structures", + "xet-runtime", +] + +[[package]] +name = "xet-runtime" +version = "1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15d8f121c33866f7648b737abe70d0e2dd9c0af4ffdd7219207531d0283aa63d" +dependencies = [ + "anyhow", + "async-trait", + "bytes", + "chrono", + "colored", + "const-str", + "ctor", + "dirs", + "futures", + "git-version", + "humantime", + "konst", + "lazy_static", + "libc", + "more-asserts", + "oneshot", + "pin-project", + "rand 0.10.1", + "reqwest 0.13.3", + "serde", + "serde_json", + "shellexpand", + "sysinfo", + "thiserror", + "tokio", + "tokio-util", + "tracing", + "tracing-appender", + "tracing-subscriber", + "whoami", + "winapi", +] + [[package]] name = "yoke" version = "0.8.2" diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml index 251f96d169..950fc1b07f 100644 --- a/bindings/python/Cargo.toml +++ b/bindings/python/Cargo.toml @@ -32,11 +32,13 @@ crate-type = ["cdylib"] [dependencies] arrow = { version = "58", features = ["pyarrow", "chrono-tz"] } +bytes = "1" iceberg = { path = "../../crates/iceberg" } iceberg-storage-opendal = { path = "../../crates/storage/opendal", features = ["opendal-all"] } pyo3 = { version = "0.28", features = ["extension-module", "abi3-py310"] } iceberg-datafusion = { path = "../../crates/integrations/datafusion" } datafusion-ffi = "53.0.0" +serde_json = "1" tokio = { version = "1.46.1", default-features = false } [profile.release] diff --git a/bindings/python/src/lib.rs b/bindings/python/src/lib.rs index 131d864bdd..235716e5a2 100644 --- a/bindings/python/src/lib.rs +++ b/bindings/python/src/lib.rs @@ -20,6 +20,7 @@ use pyo3::prelude::*; mod data_file; mod datafusion_table_provider; mod error; +mod file_io; mod manifest; mod runtime; mod transform; @@ -29,5 +30,6 @@ fn pyiceberg_core_rust(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> datafusion_table_provider::register_module(py, m)?; transform::register_module(py, m)?; manifest::register_module(py, m)?; + file_io::register_module(py, m)?; Ok(()) } From f370d59f68b46008cf7640a4656a10bd25b09d5d Mon Sep 17 00:00:00 2001 From: Abanoub Doss Date: Sun, 24 May 2026 16:24:40 -0500 Subject: [PATCH 2/7] feat(python): PyFileIO, PyInputFile, PyOutputFile binding for iceberg-rust FileIO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Exposes iceberg-rust's `FileIO` to Python via three pyclasses: - `FileIO.from_props(dict)` — primary constructor matching the same OpenDalResolvingStorageFactory plumbing already used by IcebergDataFusionTable, now returning a reusable handle instead of discarding after construction. Callers amortize setup across thousands of file opens in a single query. - `FileIO.exists(path)` / `FileIO.delete(path)` — async ops via the shared Tokio runtime handle. - `FileIO.new_input(path)` / `FileIO.new_output(path)` — sync (InputFile/OutputFile hold the storage Arc internally). - `InputFile.read()` → `bytes`, `InputFile.exists()`, `InputFile.metadata()` → dict. - `OutputFile.write(bytes)` — one-shot write. - `__repr__` on FileIO redacts any key containing secret/key/token/password/credential/passphrase. --- bindings/python/src/file_io.rs | 209 +++++++++++++++++++++++++++++++++ 1 file changed, 209 insertions(+) create mode 100644 bindings/python/src/file_io.rs diff --git a/bindings/python/src/file_io.rs b/bindings/python/src/file_io.rs new file mode 100644 index 0000000000..ba7537a3a6 --- /dev/null +++ b/bindings/python/src/file_io.rs @@ -0,0 +1,209 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::collections::HashMap; +use std::sync::Arc; + +use bytes::Bytes; +use iceberg::io::{FileIO, FileIOBuilder, InputFile, OutputFile}; +use iceberg_storage_opendal::OpenDalResolvingStorageFactory; +use pyo3::exceptions::PyIOError; +use pyo3::prelude::*; +use pyo3::types::PyDict; + +use crate::runtime::runtime; + +/// Keys whose values must be redacted in __repr__ to avoid leaking credentials. +fn is_sensitive_key(key: &str) -> bool { + let lower = key.to_lowercase(); + lower.contains("secret") + || lower.contains("key") + || lower.contains("token") + || lower.contains("password") + || lower.contains("credential") + || lower.contains("passphrase") +} + +#[pyclass(name = "FileIO", module = "pyiceberg_core.file_io", from_py_object)] +#[derive(Clone)] +pub struct PyFileIO { + inner: FileIO, + /// A copy of the original props used at construction, for __repr__. + props: HashMap, +} + +#[pymethods] +impl PyFileIO { + /// Construct a `FileIO` handle from a dict of storage properties. + /// + /// The property keys are the same ones iceberg-rust's `FileIOBuilder` recognizes + /// (e.g. `s3.region`, `s3.access-key-id`). For local-filesystem paths use + /// `file://…` URIs with an empty dict. + /// + /// The `FileIO` instance is lazily initialized on first use and cached, so + /// constructing once and reusing across many file opens amortizes the setup cost. + #[staticmethod] + fn from_props(props: HashMap) -> PyResult { + let factory = Arc::new(OpenDalResolvingStorageFactory::new()); + let file_io = FileIOBuilder::new(factory) + .with_props(props.clone()) + .build(); + Ok(PyFileIO { + inner: file_io, + props, + }) + } + + /// Check whether a file exists at the given path. + fn exists(&self, path: String) -> PyResult { + runtime() + .block_on(self.inner.exists(&path)) + .map_err(|e| PyIOError::new_err(e.to_string())) + } + + /// Delete the file at the given path. + fn delete(&self, path: String) -> PyResult<()> { + runtime() + .block_on(self.inner.delete(&path)) + .map_err(|e| PyIOError::new_err(e.to_string())) + } + + /// Open the file at `path` for reading and return a `InputFile` handle. + fn new_input(&self, path: String) -> PyResult { + let input = self + .inner + .new_input(&path) + .map_err(|e| PyIOError::new_err(e.to_string()))?; + Ok(PyInputFile { inner: input }) + } + + /// Open the file at `path` for writing and return an `OutputFile` handle. + fn new_output(&self, path: String) -> PyResult { + let output = self + .inner + .new_output(&path) + .map_err(|e| PyIOError::new_err(e.to_string()))?; + Ok(PyOutputFile { inner: output }) + } + + fn __repr__(&self) -> String { + // Build a summary of the props, redacting sensitive values. + let mut pairs: Vec = self + .props + .iter() + .map(|(k, v)| { + let display = if is_sensitive_key(k) { + "".to_string() + } else { + v.clone() + }; + format!("{k}={display}") + }) + .collect(); + pairs.sort(); // deterministic output + if pairs.is_empty() { + "FileIO()".to_string() + } else { + format!("FileIO({})", pairs.join(", ")) + } + } +} + +/// A handle for reading a single file. +/// +/// Obtained via `FileIO.new_input(path)`. +#[pyclass(name = "InputFile", module = "pyiceberg_core.file_io")] +pub struct PyInputFile { + inner: InputFile, +} + +#[pymethods] +impl PyInputFile { + /// The absolute path this input file was opened on. + fn location(&self) -> &str { + self.inner.location() + } + + /// Return `True` if the file exists in the underlying storage. + fn exists(&self) -> PyResult { + runtime() + .block_on(self.inner.exists()) + .map_err(|e| PyIOError::new_err(e.to_string())) + } + + /// Read the entire file content and return it as `bytes`. + fn read(&self) -> PyResult> { + let bytes = runtime() + .block_on(self.inner.read()) + .map_err(|e| PyIOError::new_err(e.to_string()))?; + Ok(bytes.to_vec()) + } + + /// Return a dict with file metadata. Currently exposes `size` (bytes). + fn metadata<'py>(&self, py: Python<'py>) -> PyResult> { + let meta = runtime() + .block_on(self.inner.metadata()) + .map_err(|e| PyIOError::new_err(e.to_string()))?; + let d = PyDict::new(py); + d.set_item("size", meta.size)?; + Ok(d) + } + + fn __repr__(&self) -> String { + format!("InputFile({:?})", self.inner.location()) + } +} + +/// A handle for writing a single file. +/// +/// Obtained via `FileIO.new_output(path)`. +#[pyclass(name = "OutputFile", module = "pyiceberg_core.file_io")] +pub struct PyOutputFile { + inner: OutputFile, +} + +#[pymethods] +impl PyOutputFile { + /// The absolute path this output file was opened on. + fn location(&self) -> &str { + self.inner.location() + } + + /// Write `data` to the file, replacing any existing content. + fn write(&self, data: &[u8]) -> PyResult<()> { + let bs = Bytes::copy_from_slice(data); + runtime() + .block_on(self.inner.write(bs)) + .map_err(|e| PyIOError::new_err(e.to_string())) + } + + fn __repr__(&self) -> String { + format!("OutputFile({:?})", self.inner.location()) + } +} + +pub fn register_module(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { + let this = PyModule::new(py, "file_io")?; + this.add_class::()?; + this.add_class::()?; + this.add_class::()?; + m.add_submodule(&this)?; + py.import("sys")? + .getattr("modules")? + .set_item("pyiceberg_core.file_io", this)?; + Ok(()) +} From 3e9595a40d56d5e1886cf4d7dc4bdd013f01c890 Mon Sep 17 00:00:00 2001 From: Abanoub Doss Date: Sun, 24 May 2026 16:25:06 -0500 Subject: [PATCH 3/7] feat(python): add file_io.pyi type stubs Bare signatures for FileIO, InputFile, and OutputFile with a module-level docstring explaining from_props(dict) as the primary constructor and the credential-redaction behaviour of __repr__. --- .../python/python/pyiceberg_core/file_io.pyi | 115 ++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 bindings/python/python/pyiceberg_core/file_io.pyi diff --git a/bindings/python/python/pyiceberg_core/file_io.pyi b/bindings/python/python/pyiceberg_core/file_io.pyi new file mode 100644 index 0000000000..945d55bf4c --- /dev/null +++ b/bindings/python/python/pyiceberg_core/file_io.pyi @@ -0,0 +1,115 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Type stubs for the iceberg-rust FileIO binding. + +Primary constructor: ``FileIO.from_props(props)`` where ``props`` is a plain +``dict[str, str]`` of storage-backend configuration properties (the same keys +iceberg-rust's ``FileIOBuilder`` recognises, e.g. ``s3.region``, +``s3.access-key-id``). For local-filesystem paths use ``file://…`` URIs with +an empty dict. + +The ``FileIO`` instance is lazily initialised on first use and cached, so +constructing once and reusing across many file opens amortises the storage +setup cost. + +Credential security: ``repr(FileIO(...))`` redacts any property key that +contains ``secret``, ``key``, ``token``, ``password``, ``credential``, or +``passphrase`` (case-insensitive). +""" + +class FileIO: + """Reusable handle to iceberg-rust's ``iceberg::io::FileIO``. + + Backed by ``OpenDalResolvingStorageFactory``, which auto-detects the + storage scheme from the path (``file://``, ``s3://``, ``gs://``, + ``abfss://``, …) and caches the per-scheme operator on first use. + """ + + @staticmethod + def from_props(props: dict[str, str]) -> "FileIO": + """Construct a ``FileIO`` handle from a dict of storage properties.""" + ... + + def exists(self, path: str) -> bool: + """Return ``True`` if the file at ``path`` exists in the underlying storage.""" + ... + + def delete(self, path: str) -> None: + """Delete the file at ``path``. + + Raises ``IOError`` on failure. Deleting a non-existent file is a no-op. + """ + ... + + def new_input(self, path: str) -> "InputFile": + """Open ``path`` for reading and return an ``InputFile`` handle. + + Raises ``IOError`` if the path cannot be resolved. + """ + ... + + def new_output(self, path: str) -> "OutputFile": + """Open ``path`` for writing and return an ``OutputFile`` handle. + + Raises ``IOError`` if the path cannot be resolved. + """ + ... + + def __repr__(self) -> str: ... + + +class InputFile: + """Handle for reading a single file. Obtained via ``FileIO.new_input(path)``.""" + + def location(self) -> str: + """The absolute path this input file was opened on.""" + ... + + def exists(self) -> bool: + """Return ``True`` if the file exists in the underlying storage.""" + ... + + def read(self) -> bytes: + """Read the entire file content and return it as ``bytes``. + + Raises ``IOError`` on read failure. + """ + ... + + def metadata(self) -> dict[str, int]: + """Return file metadata. Currently exposes ``size`` (bytes as int).""" + ... + + def __repr__(self) -> str: ... + + +class OutputFile: + """Handle for writing a single file. Obtained via ``FileIO.new_output(path)``.""" + + def location(self) -> str: + """The absolute path this output file was opened on.""" + ... + + def write(self, data: bytes) -> None: + """Write ``data`` to the file, replacing any existing content. + + Raises ``IOError`` on write failure. + """ + ... + + def __repr__(self) -> str: ... From 28dd818b27eb7e79ef43dfb4aaff57afb326e6f0 Mon Sep 17 00:00:00 2001 From: Abanoub Doss Date: Sun, 24 May 2026 16:25:43 -0500 Subject: [PATCH 4/7] test(python): pytest suite for PyFileIO / PyInputFile / PyOutputFile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 30 tests covering: - from_props construction (empty dict, partial props, handle independence) - __repr__ credential redaction for 7 sensitive key patterns - exists/delete via FileIO - OutputFile.write (create, overwrite, empty bytes) - InputFile.exists, read, metadata - round-trip write→read - repr format for InputFile and OutputFile All tests use tmp_path for filesystem isolation; no network deps. --- bindings/python/tests/test_file_io.py | 264 ++++++++++++++++++++++++++ 1 file changed, 264 insertions(+) create mode 100644 bindings/python/tests/test_file_io.py diff --git a/bindings/python/tests/test_file_io.py b/bindings/python/tests/test_file_io.py new file mode 100644 index 0000000000..bf9858e44f --- /dev/null +++ b/bindings/python/tests/test_file_io.py @@ -0,0 +1,264 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""pytest tests for the PyFileIO / PyInputFile / PyOutputFile binding. + +All tests use the local filesystem via ``file://`` URIs and ``tmp_path`` for +isolation; no network dependencies. +""" + +import pytest + +from pyiceberg_core.file_io import FileIO, InputFile, OutputFile + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def local_fio() -> FileIO: + """FileIO handle backed by local filesystem (empty props, file:// URIs).""" + return FileIO.from_props({}) + + +def file_uri(path) -> str: + """Convert a pathlib.Path to an absolute file:// URI.""" + return f"file://{path}" + + +# --------------------------------------------------------------------------- +# Construction +# --------------------------------------------------------------------------- + + +def test_from_props_empty_dict(): + """Constructing with an empty dict must succeed without raising.""" + fio = FileIO.from_props({}) + assert isinstance(fio, FileIO) + + +def test_from_props_with_region_only(): + """Supplying only a region property must not raise at construction time. + + FileIO is lazily initialised; actual S3 credential errors only surface on + first I/O — not at from_props(). + """ + fio = FileIO.from_props({"s3.region": "us-east-1"}) + assert isinstance(fio, FileIO) + + +def test_from_props_returns_independent_handles(): + """Two calls to from_props must return distinct objects.""" + a = FileIO.from_props({}) + b = FileIO.from_props({}) + assert a is not b + + +# --------------------------------------------------------------------------- +# __repr__ — credential redaction +# --------------------------------------------------------------------------- + +CREDENTIAL_CASES = [ + ("s3.access-key-id", "AKIAIOSFODNN7EXAMPLE"), + ("s3.secret-access-key", "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"), + ("gcs.service.account.private.key", "BEGIN PRIVATE KEY"), + ("token", "mytoken123"), + ("password", "hunter2"), + ("credential", "cred123"), + ("passphrase", "ssh-passphrase"), +] + + +@pytest.mark.parametrize("key,value", CREDENTIAL_CASES) +def test_repr_redacts_sensitive_key(key, value): + fio = FileIO.from_props({key: value}) + r = repr(fio) + assert value not in r, f"repr leaked value for key {key!r}: {r!r}" + assert "" in r + + +def test_repr_shows_non_sensitive_values(): + fio = FileIO.from_props({"s3.region": "us-east-1", "warehouse": "s3://mybucket"}) + r = repr(fio) + assert "us-east-1" in r + assert "s3://mybucket" in r + + +def test_repr_empty_props(): + fio = FileIO.from_props({}) + assert repr(fio) == "FileIO()" + + +def test_repr_mixed_sensitive_and_plain(): + fio = FileIO.from_props( + { + "s3.region": "eu-west-1", + "s3.access-key-id": "SECRET", + "warehouse": "s3://bucket", + } + ) + r = repr(fio) + assert "SECRET" not in r + assert "eu-west-1" in r + assert "s3://bucket" in r + + +# --------------------------------------------------------------------------- +# exists / delete via FileIO +# --------------------------------------------------------------------------- + + +def test_exists_returns_false_for_missing_file(tmp_path): + fio = local_fio() + uri = file_uri(tmp_path / "nonexistent.txt") + assert fio.exists(uri) is False + + +def test_exists_returns_true_after_write(tmp_path): + fio = local_fio() + uri = file_uri(tmp_path / "hello.txt") + fio.new_output(uri).write(b"data") + assert fio.exists(uri) is True + + +def test_delete_removes_file(tmp_path): + fio = local_fio() + uri = file_uri(tmp_path / "to_delete.txt") + fio.new_output(uri).write(b"bye") + assert fio.exists(uri) is True + fio.delete(uri) + assert fio.exists(uri) is False + + +def test_delete_nonexistent_file_is_noop(tmp_path): + """Deleting a file that does not exist must not raise.""" + fio = local_fio() + uri = file_uri(tmp_path / "ghost.txt") + fio.delete(uri) # should not raise + + +# --------------------------------------------------------------------------- +# new_output / OutputFile.write +# --------------------------------------------------------------------------- + + +def test_new_output_returns_output_file(tmp_path): + fio = local_fio() + uri = file_uri(tmp_path / "out.txt") + out = fio.new_output(uri) + assert isinstance(out, OutputFile) + + +def test_output_file_location(tmp_path): + fio = local_fio() + uri = file_uri(tmp_path / "loc.txt") + out = fio.new_output(uri) + assert out.location() == uri + + +def test_output_write_creates_file(tmp_path): + fio = local_fio() + path = tmp_path / "created.txt" + fio.new_output(file_uri(path)).write(b"hello") + assert path.exists() + assert path.read_bytes() == b"hello" + + +def test_output_write_overwrites_existing(tmp_path): + fio = local_fio() + path = tmp_path / "overwrite.txt" + uri = file_uri(path) + fio.new_output(uri).write(b"first") + fio.new_output(uri).write(b"second") + assert path.read_bytes() == b"second" + + +def test_output_write_empty_bytes(tmp_path): + fio = local_fio() + path = tmp_path / "empty.txt" + fio.new_output(file_uri(path)).write(b"") + assert path.read_bytes() == b"" + + +# --------------------------------------------------------------------------- +# new_input / InputFile.read / exists / metadata +# --------------------------------------------------------------------------- + + +def test_new_input_returns_input_file(tmp_path): + fio = local_fio() + path = tmp_path / "read.txt" + path.write_bytes(b"content") + inp = fio.new_input(file_uri(path)) + assert isinstance(inp, InputFile) + + +def test_input_file_location(tmp_path): + fio = local_fio() + path = tmp_path / "loc.txt" + path.write_bytes(b"x") + uri = file_uri(path) + assert fio.new_input(uri).location() == uri + + +def test_input_exists_true(tmp_path): + fio = local_fio() + path = tmp_path / "present.txt" + path.write_bytes(b"here") + assert fio.new_input(file_uri(path)).exists() is True + + +def test_input_exists_false(tmp_path): + fio = local_fio() + uri = file_uri(tmp_path / "absent.txt") + assert fio.new_input(uri).exists() is False + + +def test_round_trip_write_then_read(tmp_path): + """Write via OutputFile; read back via InputFile; bytes must match.""" + fio = local_fio() + uri = file_uri(tmp_path / "round.bin") + payload = b"\x00\x01\x02iceberg\xff" + fio.new_output(uri).write(payload) + assert fio.new_input(uri).read() == payload + + +def test_input_metadata_size(tmp_path): + fio = local_fio() + path = tmp_path / "meta.txt" + content = b"size-check" + path.write_bytes(content) + meta = fio.new_input(file_uri(path)).metadata() + assert isinstance(meta, dict) + assert meta["size"] == len(content) + + +def test_input_repr(tmp_path): + fio = local_fio() + uri = file_uri(tmp_path / "repr.txt") + r = repr(fio.new_input(uri)) + assert "InputFile" in r + assert "repr.txt" in r + + +def test_output_repr(tmp_path): + fio = local_fio() + uri = file_uri(tmp_path / "repr.txt") + r = repr(fio.new_output(uri)) + assert "OutputFile" in r + assert "repr.txt" in r From 43d457168022ce928cb1ba624df5979d86d19150 Mon Sep 17 00:00:00 2001 From: Abanoub Doss Date: Sun, 24 May 2026 17:00:16 -0500 Subject: [PATCH 5/7] refactor(python): tighten FileIO binding PR --- bindings/python/Cargo.lock | 2 - bindings/python/Cargo.toml | 2 - .../python/python/pyiceberg_core/file_io.pyi | 100 +------ .../python/python/pyiceberg_core/py.typed | 1 + bindings/python/src/file_io.rs | 37 +-- bindings/python/tests/test_file_io.py | 270 +++++------------- 6 files changed, 93 insertions(+), 319 deletions(-) create mode 100644 bindings/python/python/pyiceberg_core/py.typed diff --git a/bindings/python/Cargo.lock b/bindings/python/Cargo.lock index 65140da8d5..457242633f 100644 --- a/bindings/python/Cargo.lock +++ b/bindings/python/Cargo.lock @@ -4202,13 +4202,11 @@ name = "pyiceberg_core_rust" version = "0.9.0" dependencies = [ "arrow", - "bytes", "datafusion-ffi", "iceberg", "iceberg-datafusion", "iceberg-storage-opendal", "pyo3", - "serde_json", "tokio", ] diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml index 950fc1b07f..251f96d169 100644 --- a/bindings/python/Cargo.toml +++ b/bindings/python/Cargo.toml @@ -32,13 +32,11 @@ crate-type = ["cdylib"] [dependencies] arrow = { version = "58", features = ["pyarrow", "chrono-tz"] } -bytes = "1" iceberg = { path = "../../crates/iceberg" } iceberg-storage-opendal = { path = "../../crates/storage/opendal", features = ["opendal-all"] } pyo3 = { version = "0.28", features = ["extension-module", "abi3-py310"] } iceberg-datafusion = { path = "../../crates/integrations/datafusion" } datafusion-ffi = "53.0.0" -serde_json = "1" tokio = { version = "1.46.1", default-features = false } [profile.release] diff --git a/bindings/python/python/pyiceberg_core/file_io.pyi b/bindings/python/python/pyiceberg_core/file_io.pyi index 945d55bf4c..80b90c32e1 100644 --- a/bindings/python/python/pyiceberg_core/file_io.pyi +++ b/bindings/python/python/pyiceberg_core/file_io.pyi @@ -15,101 +15,23 @@ # specific language governing permissions and limitations # under the License. -"""Type stubs for the iceberg-rust FileIO binding. - -Primary constructor: ``FileIO.from_props(props)`` where ``props`` is a plain -``dict[str, str]`` of storage-backend configuration properties (the same keys -iceberg-rust's ``FileIOBuilder`` recognises, e.g. ``s3.region``, -``s3.access-key-id``). For local-filesystem paths use ``file://…`` URIs with -an empty dict. - -The ``FileIO`` instance is lazily initialised on first use and cached, so -constructing once and reusing across many file opens amortises the storage -setup cost. - -Credential security: ``repr(FileIO(...))`` redacts any property key that -contains ``secret``, ``key``, ``token``, ``password``, ``credential``, or -``passphrase`` (case-insensitive). -""" - class FileIO: - """Reusable handle to iceberg-rust's ``iceberg::io::FileIO``. - - Backed by ``OpenDalResolvingStorageFactory``, which auto-detects the - storage scheme from the path (``file://``, ``s3://``, ``gs://``, - ``abfss://``, …) and caches the per-scheme operator on first use. - """ - @staticmethod - def from_props(props: dict[str, str]) -> "FileIO": - """Construct a ``FileIO`` handle from a dict of storage properties.""" - ... - - def exists(self, path: str) -> bool: - """Return ``True`` if the file at ``path`` exists in the underlying storage.""" - ... - - def delete(self, path: str) -> None: - """Delete the file at ``path``. - - Raises ``IOError`` on failure. Deleting a non-existent file is a no-op. - """ - ... - - def new_input(self, path: str) -> "InputFile": - """Open ``path`` for reading and return an ``InputFile`` handle. - - Raises ``IOError`` if the path cannot be resolved. - """ - ... - - def new_output(self, path: str) -> "OutputFile": - """Open ``path`` for writing and return an ``OutputFile`` handle. - - Raises ``IOError`` if the path cannot be resolved. - """ - ... - + def from_props(props: dict[str, str]) -> "FileIO": ... + def exists(self, path: str) -> bool: ... + def delete(self, path: str) -> None: ... + def new_input(self, path: str) -> "InputFile": ... + def new_output(self, path: str) -> "OutputFile": ... def __repr__(self) -> str: ... - class InputFile: - """Handle for reading a single file. Obtained via ``FileIO.new_input(path)``.""" - - def location(self) -> str: - """The absolute path this input file was opened on.""" - ... - - def exists(self) -> bool: - """Return ``True`` if the file exists in the underlying storage.""" - ... - - def read(self) -> bytes: - """Read the entire file content and return it as ``bytes``. - - Raises ``IOError`` on read failure. - """ - ... - - def metadata(self) -> dict[str, int]: - """Return file metadata. Currently exposes ``size`` (bytes as int).""" - ... - + def location(self) -> str: ... + def exists(self) -> bool: ... + def read(self) -> bytes: ... + def metadata(self) -> dict[str, int]: ... def __repr__(self) -> str: ... - class OutputFile: - """Handle for writing a single file. Obtained via ``FileIO.new_output(path)``.""" - - def location(self) -> str: - """The absolute path this output file was opened on.""" - ... - - def write(self, data: bytes) -> None: - """Write ``data`` to the file, replacing any existing content. - - Raises ``IOError`` on write failure. - """ - ... - + def location(self) -> str: ... + def write(self, data: bytes) -> None: ... def __repr__(self) -> str: ... diff --git a/bindings/python/python/pyiceberg_core/py.typed b/bindings/python/python/pyiceberg_core/py.typed new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/bindings/python/python/pyiceberg_core/py.typed @@ -0,0 +1 @@ + diff --git a/bindings/python/src/file_io.rs b/bindings/python/src/file_io.rs index ba7537a3a6..983f38a4ce 100644 --- a/bindings/python/src/file_io.rs +++ b/bindings/python/src/file_io.rs @@ -18,7 +18,6 @@ use std::collections::HashMap; use std::sync::Arc; -use bytes::Bytes; use iceberg::io::{FileIO, FileIOBuilder, InputFile, OutputFile}; use iceberg_storage_opendal::OpenDalResolvingStorageFactory; use pyo3::exceptions::PyIOError; @@ -27,7 +26,6 @@ use pyo3::types::PyDict; use crate::runtime::runtime; -/// Keys whose values must be redacted in __repr__ to avoid leaking credentials. fn is_sensitive_key(key: &str) -> bool { let lower = key.to_lowercase(); lower.contains("secret") @@ -42,47 +40,33 @@ fn is_sensitive_key(key: &str) -> bool { #[derive(Clone)] pub struct PyFileIO { inner: FileIO, - /// A copy of the original props used at construction, for __repr__. props: HashMap, } #[pymethods] impl PyFileIO { - /// Construct a `FileIO` handle from a dict of storage properties. - /// - /// The property keys are the same ones iceberg-rust's `FileIOBuilder` recognizes - /// (e.g. `s3.region`, `s3.access-key-id`). For local-filesystem paths use - /// `file://…` URIs with an empty dict. - /// - /// The `FileIO` instance is lazily initialized on first use and cached, so - /// constructing once and reusing across many file opens amortizes the setup cost. #[staticmethod] fn from_props(props: HashMap) -> PyResult { let factory = Arc::new(OpenDalResolvingStorageFactory::new()); - let file_io = FileIOBuilder::new(factory) - .with_props(props.clone()) - .build(); + let file_io = FileIOBuilder::new(factory).with_props(props.clone()).build(); Ok(PyFileIO { inner: file_io, props, }) } - /// Check whether a file exists at the given path. fn exists(&self, path: String) -> PyResult { runtime() .block_on(self.inner.exists(&path)) .map_err(|e| PyIOError::new_err(e.to_string())) } - /// Delete the file at the given path. fn delete(&self, path: String) -> PyResult<()> { runtime() .block_on(self.inner.delete(&path)) .map_err(|e| PyIOError::new_err(e.to_string())) } - /// Open the file at `path` for reading and return a `InputFile` handle. fn new_input(&self, path: String) -> PyResult { let input = self .inner @@ -91,7 +75,6 @@ impl PyFileIO { Ok(PyInputFile { inner: input }) } - /// Open the file at `path` for writing and return an `OutputFile` handle. fn new_output(&self, path: String) -> PyResult { let output = self .inner @@ -101,7 +84,6 @@ impl PyFileIO { } fn __repr__(&self) -> String { - // Build a summary of the props, redacting sensitive values. let mut pairs: Vec = self .props .iter() @@ -114,7 +96,7 @@ impl PyFileIO { format!("{k}={display}") }) .collect(); - pairs.sort(); // deterministic output + pairs.sort(); if pairs.is_empty() { "FileIO()".to_string() } else { @@ -123,9 +105,6 @@ impl PyFileIO { } } -/// A handle for reading a single file. -/// -/// Obtained via `FileIO.new_input(path)`. #[pyclass(name = "InputFile", module = "pyiceberg_core.file_io")] pub struct PyInputFile { inner: InputFile, @@ -133,19 +112,16 @@ pub struct PyInputFile { #[pymethods] impl PyInputFile { - /// The absolute path this input file was opened on. fn location(&self) -> &str { self.inner.location() } - /// Return `True` if the file exists in the underlying storage. fn exists(&self) -> PyResult { runtime() .block_on(self.inner.exists()) .map_err(|e| PyIOError::new_err(e.to_string())) } - /// Read the entire file content and return it as `bytes`. fn read(&self) -> PyResult> { let bytes = runtime() .block_on(self.inner.read()) @@ -153,7 +129,6 @@ impl PyInputFile { Ok(bytes.to_vec()) } - /// Return a dict with file metadata. Currently exposes `size` (bytes). fn metadata<'py>(&self, py: Python<'py>) -> PyResult> { let meta = runtime() .block_on(self.inner.metadata()) @@ -168,9 +143,6 @@ impl PyInputFile { } } -/// A handle for writing a single file. -/// -/// Obtained via `FileIO.new_output(path)`. #[pyclass(name = "OutputFile", module = "pyiceberg_core.file_io")] pub struct PyOutputFile { inner: OutputFile, @@ -178,16 +150,13 @@ pub struct PyOutputFile { #[pymethods] impl PyOutputFile { - /// The absolute path this output file was opened on. fn location(&self) -> &str { self.inner.location() } - /// Write `data` to the file, replacing any existing content. fn write(&self, data: &[u8]) -> PyResult<()> { - let bs = Bytes::copy_from_slice(data); runtime() - .block_on(self.inner.write(bs)) + .block_on(self.inner.write(data.to_vec().into())) .map_err(|e| PyIOError::new_err(e.to_string())) } diff --git a/bindings/python/tests/test_file_io.py b/bindings/python/tests/test_file_io.py index bf9858e44f..a3d2ca1653 100644 --- a/bindings/python/tests/test_file_io.py +++ b/bindings/python/tests/test_file_io.py @@ -15,250 +15,136 @@ # specific language governing permissions and limitations # under the License. -"""pytest tests for the PyFileIO / PyInputFile / PyOutputFile binding. - -All tests use the local filesystem via ``file://`` URIs and ``tmp_path`` for -isolation; no network dependencies. -""" - import pytest from pyiceberg_core.file_io import FileIO, InputFile, OutputFile -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - - def local_fio() -> FileIO: - """FileIO handle backed by local filesystem (empty props, file:// URIs).""" return FileIO.from_props({}) def file_uri(path) -> str: - """Convert a pathlib.Path to an absolute file:// URI.""" return f"file://{path}" -# --------------------------------------------------------------------------- -# Construction -# --------------------------------------------------------------------------- - - -def test_from_props_empty_dict(): - """Constructing with an empty dict must succeed without raising.""" - fio = FileIO.from_props({}) - assert isinstance(fio, FileIO) - - -def test_from_props_with_region_only(): - """Supplying only a region property must not raise at construction time. - - FileIO is lazily initialised; actual S3 credential errors only surface on - first I/O — not at from_props(). - """ - fio = FileIO.from_props({"s3.region": "us-east-1"}) - assert isinstance(fio, FileIO) - - def test_from_props_returns_independent_handles(): - """Two calls to from_props must return distinct objects.""" a = FileIO.from_props({}) - b = FileIO.from_props({}) + b = FileIO.from_props({"s3.region": "us-east-1"}) + assert isinstance(a, FileIO) + assert isinstance(b, FileIO) assert a is not b -# --------------------------------------------------------------------------- -# __repr__ — credential redaction -# --------------------------------------------------------------------------- - -CREDENTIAL_CASES = [ - ("s3.access-key-id", "AKIAIOSFODNN7EXAMPLE"), - ("s3.secret-access-key", "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"), - ("gcs.service.account.private.key", "BEGIN PRIVATE KEY"), - ("token", "mytoken123"), - ("password", "hunter2"), - ("credential", "cred123"), - ("passphrase", "ssh-passphrase"), -] - +@pytest.mark.parametrize( + "key,value", + [ + ("s3.access-key-id", "AKIAIOSFODNN7EXAMPLE"), + ("s3.secret-access-key", "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"), + ("gcs.service.account.private.key", "BEGIN PRIVATE KEY"), + ("token", "mytoken123"), + ("password", "hunter2"), + ("credential", "cred123"), + ("passphrase", "ssh-passphrase"), + ], +) +def test_repr_redacts_sensitive_values(key, value): + r = repr(FileIO.from_props({key: value, "s3.region": "us-east-1"})) + assert value not in r + assert "s3.region=us-east-1" in r + assert f"{key}=" in r -@pytest.mark.parametrize("key,value", CREDENTIAL_CASES) -def test_repr_redacts_sensitive_key(key, value): - fio = FileIO.from_props({key: value}) - r = repr(fio) - assert value not in r, f"repr leaked value for key {key!r}: {r!r}" - assert "" in r - -def test_repr_shows_non_sensitive_values(): - fio = FileIO.from_props({"s3.region": "us-east-1", "warehouse": "s3://mybucket"}) - r = repr(fio) - assert "us-east-1" in r - assert "s3://mybucket" in r +def test_repr_shows_plain_values(): + r = repr(FileIO.from_props({"warehouse": "s3://bucket", "s3.region": "us-east-1"})) + assert "s3.region=us-east-1" in r + assert "warehouse=s3://bucket" in r def test_repr_empty_props(): - fio = FileIO.from_props({}) - assert repr(fio) == "FileIO()" - - -def test_repr_mixed_sensitive_and_plain(): - fio = FileIO.from_props( - { - "s3.region": "eu-west-1", - "s3.access-key-id": "SECRET", - "warehouse": "s3://bucket", - } - ) - r = repr(fio) - assert "SECRET" not in r - assert "eu-west-1" in r - assert "s3://bucket" in r - - -# --------------------------------------------------------------------------- -# exists / delete via FileIO -# --------------------------------------------------------------------------- - - -def test_exists_returns_false_for_missing_file(tmp_path): - fio = local_fio() - uri = file_uri(tmp_path / "nonexistent.txt") - assert fio.exists(uri) is False + assert repr(FileIO.from_props({})) == "FileIO()" -def test_exists_returns_true_after_write(tmp_path): +def test_new_file_handles_expose_location(tmp_path): fio = local_fio() - uri = file_uri(tmp_path / "hello.txt") - fio.new_output(uri).write(b"data") - assert fio.exists(uri) is True - + uri = file_uri(tmp_path / "data.bin") -def test_delete_removes_file(tmp_path): - fio = local_fio() - uri = file_uri(tmp_path / "to_delete.txt") - fio.new_output(uri).write(b"bye") - assert fio.exists(uri) is True - fio.delete(uri) - assert fio.exists(uri) is False - - -def test_delete_nonexistent_file_is_noop(tmp_path): - """Deleting a file that does not exist must not raise.""" - fio = local_fio() - uri = file_uri(tmp_path / "ghost.txt") - fio.delete(uri) # should not raise - - -# --------------------------------------------------------------------------- -# new_output / OutputFile.write -# --------------------------------------------------------------------------- - - -def test_new_output_returns_output_file(tmp_path): - fio = local_fio() - uri = file_uri(tmp_path / "out.txt") + inp = fio.new_input(uri) out = fio.new_output(uri) - assert isinstance(out, OutputFile) - -def test_output_file_location(tmp_path): - fio = local_fio() - uri = file_uri(tmp_path / "loc.txt") - out = fio.new_output(uri) + assert isinstance(inp, InputFile) + assert isinstance(out, OutputFile) + assert inp.location() == uri assert out.location() == uri -def test_output_write_creates_file(tmp_path): +@pytest.mark.parametrize( + "payload", + [ + b"", + b"x", + b"\x00\x01\x02iceberg\xff", + bytes(range(256)), + ], +) +def test_write_read_exists_and_metadata_round_trip(tmp_path, payload): fio = local_fio() - path = tmp_path / "created.txt" - fio.new_output(file_uri(path)).write(b"hello") - assert path.exists() - assert path.read_bytes() == b"hello" - - -def test_output_write_overwrites_existing(tmp_path): - fio = local_fio() - path = tmp_path / "overwrite.txt" + path = tmp_path / "round-trip.bin" uri = file_uri(path) - fio.new_output(uri).write(b"first") - fio.new_output(uri).write(b"second") - assert path.read_bytes() == b"second" + assert fio.exists(uri) is False + assert fio.new_input(uri).exists() is False -def test_output_write_empty_bytes(tmp_path): - fio = local_fio() - path = tmp_path / "empty.txt" - fio.new_output(file_uri(path)).write(b"") - assert path.read_bytes() == b"" - + fio.new_output(uri).write(payload) -# --------------------------------------------------------------------------- -# new_input / InputFile.read / exists / metadata -# --------------------------------------------------------------------------- + assert fio.exists(uri) is True + assert fio.new_input(uri).exists() is True + assert fio.new_input(uri).read() == payload + assert fio.new_input(uri).metadata() == {"size": len(payload)} + assert path.read_bytes() == payload -def test_new_input_returns_input_file(tmp_path): - fio = local_fio() - path = tmp_path / "read.txt" - path.write_bytes(b"content") - inp = fio.new_input(file_uri(path)) - assert isinstance(inp, InputFile) +def test_write_replaces_existing_file(tmp_path): + path = tmp_path / "overwrite.txt" + out = local_fio().new_output(file_uri(path)) + out.write(b"first") + out.write(b"second") -def test_input_file_location(tmp_path): - fio = local_fio() - path = tmp_path / "loc.txt" - path.write_bytes(b"x") - uri = file_uri(path) - assert fio.new_input(uri).location() == uri + assert path.read_bytes() == b"second" -def test_input_exists_true(tmp_path): +def test_delete_removes_file_and_missing_delete_is_noop(tmp_path): fio = local_fio() - path = tmp_path / "present.txt" - path.write_bytes(b"here") - assert fio.new_input(file_uri(path)).exists() is True - + uri = file_uri(tmp_path / "delete-me.txt") -def test_input_exists_false(tmp_path): - fio = local_fio() - uri = file_uri(tmp_path / "absent.txt") - assert fio.new_input(uri).exists() is False + fio.new_output(uri).write(b"bye") + fio.delete(uri) + fio.delete(uri) + assert fio.exists(uri) is False -def test_round_trip_write_then_read(tmp_path): - """Write via OutputFile; read back via InputFile; bytes must match.""" - fio = local_fio() - uri = file_uri(tmp_path / "round.bin") - payload = b"\x00\x01\x02iceberg\xff" - fio.new_output(uri).write(payload) - assert fio.new_input(uri).read() == payload +@pytest.mark.parametrize("method", ["read", "metadata"]) +def test_missing_input_operations_raise_io_error(tmp_path, method): + inp = local_fio().new_input(file_uri(tmp_path / "missing.txt")) -def test_input_metadata_size(tmp_path): - fio = local_fio() - path = tmp_path / "meta.txt" - content = b"size-check" - path.write_bytes(content) - meta = fio.new_input(file_uri(path)).metadata() - assert isinstance(meta, dict) - assert meta["size"] == len(content) + with pytest.raises(OSError): + getattr(inp, method)() -def test_input_repr(tmp_path): - fio = local_fio() - uri = file_uri(tmp_path / "repr.txt") - r = repr(fio.new_input(uri)) - assert "InputFile" in r - assert "repr.txt" in r +def test_writing_directory_raises_io_error(tmp_path): + with pytest.raises(OSError): + local_fio().new_output(file_uri(tmp_path)).write(b"data") -def test_output_repr(tmp_path): +def test_file_handle_repr_names_type_and_location(tmp_path): fio = local_fio() uri = file_uri(tmp_path / "repr.txt") - r = repr(fio.new_output(uri)) - assert "OutputFile" in r - assert "repr.txt" in r + input_repr = repr(fio.new_input(uri)) + output_repr = repr(fio.new_output(uri)) + + assert "InputFile" in input_repr + assert uri in input_repr + assert "OutputFile" in output_repr + assert uri in output_repr From 718c5e2c2781365571ae45d4b6de72d477090faa Mon Sep 17 00:00:00 2001 From: Abanoub Doss Date: Sun, 24 May 2026 18:14:17 -0500 Subject: [PATCH 6/7] fix(python): release GIL during FileIO operations --- .../python/pyiceberg_core/datafusion.pyi | 27 +++++++ .../python/python/pyiceberg_core/file_io.pyi | 2 +- .../python/python/pyiceberg_core/manifest.pyi | 81 +++++++++++++++++++ .../python/pyiceberg_core/transform.pyi | 27 +++++++ bindings/python/src/file_io.rs | 74 +++++++++-------- bindings/python/tests/test_file_io.py | 20 +++-- 6 files changed, 190 insertions(+), 41 deletions(-) create mode 100644 bindings/python/python/pyiceberg_core/datafusion.pyi create mode 100644 bindings/python/python/pyiceberg_core/manifest.pyi create mode 100644 bindings/python/python/pyiceberg_core/transform.pyi diff --git a/bindings/python/python/pyiceberg_core/datafusion.pyi b/bindings/python/python/pyiceberg_core/datafusion.pyi new file mode 100644 index 0000000000..a4ce8c5fd6 --- /dev/null +++ b/bindings/python/python/pyiceberg_core/datafusion.pyi @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import Any + +class IcebergDataFusionTable: + def __init__( + self, + identifier: list[str], + metadata_location: str, + file_io_properties: dict[str, str] | None = None, + ) -> None: ... + def __datafusion_table_provider__(self, session: Any) -> Any: ... diff --git a/bindings/python/python/pyiceberg_core/file_io.pyi b/bindings/python/python/pyiceberg_core/file_io.pyi index 80b90c32e1..c3a7942839 100644 --- a/bindings/python/python/pyiceberg_core/file_io.pyi +++ b/bindings/python/python/pyiceberg_core/file_io.pyi @@ -28,7 +28,7 @@ class InputFile: def location(self) -> str: ... def exists(self) -> bool: ... def read(self) -> bytes: ... - def metadata(self) -> dict[str, int]: ... + def size(self) -> int: ... def __repr__(self) -> str: ... class OutputFile: diff --git a/bindings/python/python/pyiceberg_core/manifest.pyi b/bindings/python/python/pyiceberg_core/manifest.pyi new file mode 100644 index 0000000000..06061c68c0 --- /dev/null +++ b/bindings/python/python/pyiceberg_core/manifest.pyi @@ -0,0 +1,81 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import Any + +class PyManifest: + def entries(self) -> list[PyManifestEntry]: ... + +class PyFieldSummary: + @property + def contains_null(self) -> bool: ... + @property + def contains_nan(self) -> bool | None: ... + @property + def lower_bound(self) -> bytes | None: ... + @property + def upper_bound(self) -> bytes | None: ... + +class PyManifestFile: + @property + def manifest_path(self) -> str: ... + @property + def manifest_length(self) -> int: ... + @property + def partition_spec_id(self) -> int: ... + @property + def content(self) -> int: ... + @property + def sequence_number(self) -> int: ... + @property + def min_sequence_number(self) -> int: ... + @property + def added_snapshot_id(self) -> int: ... + @property + def added_files_count(self) -> int | None: ... + @property + def existing_files_count(self) -> int | None: ... + @property + def deleted_files_count(self) -> int | None: ... + @property + def added_rows_count(self) -> int | None: ... + @property + def existing_rows_count(self) -> int | None: ... + @property + def deleted_rows_count(self) -> int | None: ... + @property + def partitions(self) -> list[PyFieldSummary]: ... + @property + def key_metadata(self) -> bytes | None: ... + +class PyManifestEntry: + @property + def status(self) -> int: ... + @property + def snapshot_id(self) -> int | None: ... + @property + def sequence_number(self) -> int | None: ... + @property + def file_sequence_number(self) -> int | None: ... + @property + def data_file(self) -> Any: ... + +class PyManifestList: + def entries(self) -> list[PyManifestFile]: ... + +def read_manifest_entries(bs: bytes) -> PyManifest: ... +def read_manifest_list(bs: bytes) -> PyManifestList: ... diff --git a/bindings/python/python/pyiceberg_core/transform.pyi b/bindings/python/python/pyiceberg_core/transform.pyi new file mode 100644 index 0000000000..91a34fa24c --- /dev/null +++ b/bindings/python/python/pyiceberg_core/transform.pyi @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import Any + +def identity(array: Any) -> Any: ... +def void(array: Any) -> Any: ... +def year(array: Any) -> Any: ... +def month(array: Any) -> Any: ... +def day(array: Any) -> Any: ... +def hour(array: Any) -> Any: ... +def bucket(array: Any, num_buckets: int) -> Any: ... +def truncate(array: Any, width: int) -> Any: ... diff --git a/bindings/python/src/file_io.rs b/bindings/python/src/file_io.rs index 983f38a4ce..f3b747dd4f 100644 --- a/bindings/python/src/file_io.rs +++ b/bindings/python/src/file_io.rs @@ -22,7 +22,7 @@ use iceberg::io::{FileIO, FileIOBuilder, InputFile, OutputFile}; use iceberg_storage_opendal::OpenDalResolvingStorageFactory; use pyo3::exceptions::PyIOError; use pyo3::prelude::*; -use pyo3::types::PyDict; +use pyo3::types::PyBytes; use crate::runtime::runtime; @@ -36,7 +36,7 @@ fn is_sensitive_key(key: &str) -> bool { || lower.contains("passphrase") } -#[pyclass(name = "FileIO", module = "pyiceberg_core.file_io", from_py_object)] +#[pyclass(name = "FileIO", module = "pyiceberg_core.file_io", skip_from_py_object)] #[derive(Clone)] pub struct PyFileIO { inner: FileIO, @@ -55,16 +55,20 @@ impl PyFileIO { }) } - fn exists(&self, path: String) -> PyResult { - runtime() - .block_on(self.inner.exists(&path)) - .map_err(|e| PyIOError::new_err(e.to_string())) + fn exists(&self, py: Python<'_>, path: String) -> PyResult { + py.detach(|| { + runtime() + .block_on(self.inner.exists(&path)) + .map_err(|e| PyIOError::new_err(e.to_string())) + }) } - fn delete(&self, path: String) -> PyResult<()> { - runtime() - .block_on(self.inner.delete(&path)) - .map_err(|e| PyIOError::new_err(e.to_string())) + fn delete(&self, py: Python<'_>, path: String) -> PyResult<()> { + py.detach(|| { + runtime() + .block_on(self.inner.delete(&path)) + .map_err(|e| PyIOError::new_err(e.to_string())) + }) } fn new_input(&self, path: String) -> PyResult { @@ -116,26 +120,30 @@ impl PyInputFile { self.inner.location() } - fn exists(&self) -> PyResult { - runtime() - .block_on(self.inner.exists()) - .map_err(|e| PyIOError::new_err(e.to_string())) + fn exists(&self, py: Python<'_>) -> PyResult { + py.detach(|| { + runtime() + .block_on(self.inner.exists()) + .map_err(|e| PyIOError::new_err(e.to_string())) + }) } - fn read(&self) -> PyResult> { - let bytes = runtime() - .block_on(self.inner.read()) - .map_err(|e| PyIOError::new_err(e.to_string()))?; - Ok(bytes.to_vec()) + fn read<'py>(&self, py: Python<'py>) -> PyResult> { + let bytes = py.detach(|| { + runtime() + .block_on(self.inner.read()) + .map_err(|e| PyIOError::new_err(e.to_string())) + })?; + Ok(PyBytes::new(py, &bytes)) } - fn metadata<'py>(&self, py: Python<'py>) -> PyResult> { - let meta = runtime() - .block_on(self.inner.metadata()) - .map_err(|e| PyIOError::new_err(e.to_string()))?; - let d = PyDict::new(py); - d.set_item("size", meta.size)?; - Ok(d) + fn size(&self, py: Python<'_>) -> PyResult { + py.detach(|| { + runtime() + .block_on(self.inner.metadata()) + .map_err(|e| PyIOError::new_err(e.to_string())) + .map(|meta| meta.size) + }) } fn __repr__(&self) -> String { @@ -154,10 +162,13 @@ impl PyOutputFile { self.inner.location() } - fn write(&self, data: &[u8]) -> PyResult<()> { - runtime() - .block_on(self.inner.write(data.to_vec().into())) - .map_err(|e| PyIOError::new_err(e.to_string())) + fn write(&self, py: Python<'_>, data: &[u8]) -> PyResult<()> { + let data = data.to_vec(); + py.detach(|| { + runtime() + .block_on(self.inner.write(data.into())) + .map_err(|e| PyIOError::new_err(e.to_string())) + }) } fn __repr__(&self) -> String { @@ -173,6 +184,5 @@ pub fn register_module(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> m.add_submodule(&this)?; py.import("sys")? .getattr("modules")? - .set_item("pyiceberg_core.file_io", this)?; - Ok(()) + .set_item("pyiceberg_core.file_io", this) } diff --git a/bindings/python/tests/test_file_io.py b/bindings/python/tests/test_file_io.py index a3d2ca1653..3081817abe 100644 --- a/bindings/python/tests/test_file_io.py +++ b/bindings/python/tests/test_file_io.py @@ -87,7 +87,7 @@ def test_new_file_handles_expose_location(tmp_path): bytes(range(256)), ], ) -def test_write_read_exists_and_metadata_round_trip(tmp_path, payload): +def test_write_read_exists_and_size_round_trip(tmp_path, payload): fio = local_fio() path = tmp_path / "round-trip.bin" uri = file_uri(path) @@ -100,7 +100,7 @@ def test_write_read_exists_and_metadata_round_trip(tmp_path, payload): assert fio.exists(uri) is True assert fio.new_input(uri).exists() is True assert fio.new_input(uri).read() == payload - assert fio.new_input(uri).metadata() == {"size": len(payload)} + assert fio.new_input(uri).size() == len(payload) assert path.read_bytes() == payload @@ -114,18 +114,17 @@ def test_write_replaces_existing_file(tmp_path): assert path.read_bytes() == b"second" -def test_delete_removes_file_and_missing_delete_is_noop(tmp_path): +def test_delete_removes_file(tmp_path): fio = local_fio() uri = file_uri(tmp_path / "delete-me.txt") fio.new_output(uri).write(b"bye") fio.delete(uri) - fio.delete(uri) assert fio.exists(uri) is False -@pytest.mark.parametrize("method", ["read", "metadata"]) +@pytest.mark.parametrize("method", ["read", "size"]) def test_missing_input_operations_raise_io_error(tmp_path, method): inp = local_fio().new_input(file_uri(tmp_path / "missing.txt")) @@ -133,9 +132,14 @@ def test_missing_input_operations_raise_io_error(tmp_path, method): getattr(inp, method)() -def test_writing_directory_raises_io_error(tmp_path): - with pytest.raises(OSError): - local_fio().new_output(file_uri(tmp_path)).write(b"data") +def test_handle_is_reusable_across_many_opens(tmp_path): + fio = local_fio() + + for i in range(50): + fio.new_output(file_uri(tmp_path / f"f{i}")).write(b"x") + + for i in range(50): + assert fio.new_input(file_uri(tmp_path / f"f{i}")).read() == b"x" def test_file_handle_repr_names_type_and_location(tmp_path): From f4c0bcc6830d0af5c3919ae3dfb2c40e12d28092 Mon Sep 17 00:00:00 2001 From: Abanoub Doss Date: Sun, 24 May 2026 18:39:54 -0500 Subject: [PATCH 7/7] chore(python): defer file io typing stubs --- .../python/pyiceberg_core/datafusion.pyi | 27 ------- .../python/python/pyiceberg_core/file_io.pyi | 37 --------- .../python/python/pyiceberg_core/manifest.pyi | 81 ------------------- .../python/python/pyiceberg_core/py.typed | 1 - .../python/pyiceberg_core/transform.pyi | 27 ------- 5 files changed, 173 deletions(-) delete mode 100644 bindings/python/python/pyiceberg_core/datafusion.pyi delete mode 100644 bindings/python/python/pyiceberg_core/file_io.pyi delete mode 100644 bindings/python/python/pyiceberg_core/manifest.pyi delete mode 100644 bindings/python/python/pyiceberg_core/py.typed delete mode 100644 bindings/python/python/pyiceberg_core/transform.pyi diff --git a/bindings/python/python/pyiceberg_core/datafusion.pyi b/bindings/python/python/pyiceberg_core/datafusion.pyi deleted file mode 100644 index a4ce8c5fd6..0000000000 --- a/bindings/python/python/pyiceberg_core/datafusion.pyi +++ /dev/null @@ -1,27 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from typing import Any - -class IcebergDataFusionTable: - def __init__( - self, - identifier: list[str], - metadata_location: str, - file_io_properties: dict[str, str] | None = None, - ) -> None: ... - def __datafusion_table_provider__(self, session: Any) -> Any: ... diff --git a/bindings/python/python/pyiceberg_core/file_io.pyi b/bindings/python/python/pyiceberg_core/file_io.pyi deleted file mode 100644 index c3a7942839..0000000000 --- a/bindings/python/python/pyiceberg_core/file_io.pyi +++ /dev/null @@ -1,37 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -class FileIO: - @staticmethod - def from_props(props: dict[str, str]) -> "FileIO": ... - def exists(self, path: str) -> bool: ... - def delete(self, path: str) -> None: ... - def new_input(self, path: str) -> "InputFile": ... - def new_output(self, path: str) -> "OutputFile": ... - def __repr__(self) -> str: ... - -class InputFile: - def location(self) -> str: ... - def exists(self) -> bool: ... - def read(self) -> bytes: ... - def size(self) -> int: ... - def __repr__(self) -> str: ... - -class OutputFile: - def location(self) -> str: ... - def write(self, data: bytes) -> None: ... - def __repr__(self) -> str: ... diff --git a/bindings/python/python/pyiceberg_core/manifest.pyi b/bindings/python/python/pyiceberg_core/manifest.pyi deleted file mode 100644 index 06061c68c0..0000000000 --- a/bindings/python/python/pyiceberg_core/manifest.pyi +++ /dev/null @@ -1,81 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from typing import Any - -class PyManifest: - def entries(self) -> list[PyManifestEntry]: ... - -class PyFieldSummary: - @property - def contains_null(self) -> bool: ... - @property - def contains_nan(self) -> bool | None: ... - @property - def lower_bound(self) -> bytes | None: ... - @property - def upper_bound(self) -> bytes | None: ... - -class PyManifestFile: - @property - def manifest_path(self) -> str: ... - @property - def manifest_length(self) -> int: ... - @property - def partition_spec_id(self) -> int: ... - @property - def content(self) -> int: ... - @property - def sequence_number(self) -> int: ... - @property - def min_sequence_number(self) -> int: ... - @property - def added_snapshot_id(self) -> int: ... - @property - def added_files_count(self) -> int | None: ... - @property - def existing_files_count(self) -> int | None: ... - @property - def deleted_files_count(self) -> int | None: ... - @property - def added_rows_count(self) -> int | None: ... - @property - def existing_rows_count(self) -> int | None: ... - @property - def deleted_rows_count(self) -> int | None: ... - @property - def partitions(self) -> list[PyFieldSummary]: ... - @property - def key_metadata(self) -> bytes | None: ... - -class PyManifestEntry: - @property - def status(self) -> int: ... - @property - def snapshot_id(self) -> int | None: ... - @property - def sequence_number(self) -> int | None: ... - @property - def file_sequence_number(self) -> int | None: ... - @property - def data_file(self) -> Any: ... - -class PyManifestList: - def entries(self) -> list[PyManifestFile]: ... - -def read_manifest_entries(bs: bytes) -> PyManifest: ... -def read_manifest_list(bs: bytes) -> PyManifestList: ... diff --git a/bindings/python/python/pyiceberg_core/py.typed b/bindings/python/python/pyiceberg_core/py.typed deleted file mode 100644 index 8b13789179..0000000000 --- a/bindings/python/python/pyiceberg_core/py.typed +++ /dev/null @@ -1 +0,0 @@ - diff --git a/bindings/python/python/pyiceberg_core/transform.pyi b/bindings/python/python/pyiceberg_core/transform.pyi deleted file mode 100644 index 91a34fa24c..0000000000 --- a/bindings/python/python/pyiceberg_core/transform.pyi +++ /dev/null @@ -1,27 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from typing import Any - -def identity(array: Any) -> Any: ... -def void(array: Any) -> Any: ... -def year(array: Any) -> Any: ... -def month(array: Any) -> Any: ... -def day(array: Any) -> Any: ... -def hour(array: Any) -> Any: ... -def bucket(array: Any, num_buckets: int) -> Any: ... -def truncate(array: Any, width: int) -> Any: ...