From 14fdbf6d509fdf6f9f13ba1f868e97fc8f4d6fc9 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 24 Feb 2026 12:41:37 -0800 Subject: [PATCH 01/16] add more delta packed benches --- parquet/benches/arrow_reader.rs | 77 +++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/parquet/benches/arrow_reader.rs b/parquet/benches/arrow_reader.rs index 2ea0706e3517..25bae1441851 100644 --- a/parquet/benches/arrow_reader.rs +++ b/parquet/benches/arrow_reader.rs @@ -326,6 +326,53 @@ where InMemoryPageIterator::new(pages) } +fn build_delta_encoded_incr_primitive_page_iterator( + column_desc: ColumnDescPtr, + null_density: f32, + increment: usize, +) -> impl PageIterator + Clone +where + T: parquet::data_type::DataType, + T::T: SampleUniform + FromPrimitive, +{ + let max_def_level = column_desc.max_def_level(); + let max_rep_level = column_desc.max_rep_level(); + let rep_levels = vec![0; VALUES_PER_PAGE]; + let mut rng = seedable_rng(); + let mut pages: Vec> = Vec::new(); + let mut running_val: usize = 1; + for _i in 0..NUM_ROW_GROUPS { + let mut column_chunk_pages = Vec::new(); + for _j in 0..PAGES_PER_GROUP { + // generate page + let mut values = Vec::with_capacity(VALUES_PER_PAGE); + let mut def_levels = Vec::with_capacity(VALUES_PER_PAGE); + for _k in 0..VALUES_PER_PAGE { + let def_level = if rng.random::() < null_density { + max_def_level - 1 + } else { + max_def_level + }; + if def_level == max_def_level { + let value = FromPrimitive::from_usize(running_val).unwrap(); + running_val += increment; + values.push(value); + } + def_levels.push(def_level); + } + let mut page_builder = + DataPageBuilderImpl::new(column_desc.clone(), values.len() as u32, true); + page_builder.add_rep_levels(max_rep_level, &rep_levels); + page_builder.add_def_levels(max_def_level, &def_levels); + page_builder.add_values::(Encoding::DELTA_BINARY_PACKED, &values); + column_chunk_pages.push(page_builder.consume()); + } + pages.push(column_chunk_pages); + } + + InMemoryPageIterator::new(pages) +} + fn build_dictionary_encoded_primitive_page_iterator( column_desc: ColumnDescPtr, null_density: f32, @@ -1061,6 +1108,36 @@ fn bench_primitive( assert_eq!(count, EXPECTED_VALUE_COUNT); }); + // binary packed same value + let data = build_delta_encoded_incr_primitive_page_iterator::( + mandatory_column_desc.clone(), + 0.0, + 0, + ); + group.bench_function("binary packed single value", |b| { + b.iter(|| { + let array_reader = + create_primitive_array_reader(data.clone(), mandatory_column_desc.clone()); + count = bench_array_reader_skip(array_reader); + }); + assert_eq!(count, EXPECTED_VALUE_COUNT); + }); + + // binary packed monotonically increasing + let data = build_delta_encoded_incr_primitive_page_iterator::( + mandatory_column_desc.clone(), + 0.0, + 1, + ); + group.bench_function("binary packed increasing value", |b| { + b.iter(|| { + let array_reader = + create_primitive_array_reader(data.clone(), mandatory_column_desc.clone()); + count = bench_array_reader_skip(array_reader); + }); + assert_eq!(count, EXPECTED_VALUE_COUNT); + }); + let data = build_encoded_primitive_page_iterator::( optional_column_desc.clone(), 0.0, From 2610ce2decc166182d608364c5524bd8e92563f4 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 2 Mar 2026 09:55:30 -0800 Subject: [PATCH 02/16] add delta length byte array with constant length --- parquet/benches/arrow_reader.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/parquet/benches/arrow_reader.rs b/parquet/benches/arrow_reader.rs index 25bae1441851..812f19b93987 100644 --- a/parquet/benches/arrow_reader.rs +++ b/parquet/benches/arrow_reader.rs @@ -1424,6 +1424,20 @@ fn decimal_benches(c: &mut Criterion) { ); group.finish(); + let mut group = c.benchmark_group("arrow_array_reader/BYTE_ARRAY/Decimal128Array"); + let mandatory_decimal3_leaf_desc = schema.column(10); + let optional_decimal3_leaf_desc = schema.column(11); + bench_byte_decimal::( + &mut group, + &mandatory_decimal3_leaf_desc, + &optional_decimal3_leaf_desc, + Encoding::DELTA_LENGTH_BYTE_ARRAY, + // precision is 16: the max is 9999999999999999 + 9999999999999000, + 9999999999999999, + ); + group.finish(); + // parquet FIXED_LEN_BYTE_ARRAY, logical type decimal(16,2) let mut group = c.benchmark_group("arrow_array_reader/FIXED_LEN_BYTE_ARRAY/Decimal128Array"); let mandatory_decimal4_leaf_desc = schema.column(12); From 9fd7912229616b26d414a4e6a2c62e00c9d437b1 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 2 Mar 2026 10:34:23 -0800 Subject: [PATCH 03/16] add delta byte array benches --- parquet/benches/arrow_reader.rs | 79 +++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/parquet/benches/arrow_reader.rs b/parquet/benches/arrow_reader.rs index 812f19b93987..081901b52587 100644 --- a/parquet/benches/arrow_reader.rs +++ b/parquet/benches/arrow_reader.rs @@ -486,6 +486,51 @@ fn build_plain_encoded_byte_array_page_iterator_inner( InMemoryPageIterator::new(pages) } +fn build_constant_prefix_delta_encoded_byte_array_page_iterator( + column_desc: ColumnDescPtr, + null_density: f32, + const_string: bool, +) -> impl PageIterator + Clone { + let max_def_level = column_desc.max_def_level(); + let max_rep_level = column_desc.max_rep_level(); + let rep_levels = vec![0; VALUES_PER_PAGE]; + let mut rng = seedable_rng(); + let mut pages: Vec> = Vec::new(); + for i in 0..NUM_ROW_GROUPS { + let mut column_chunk_pages = Vec::new(); + for j in 0..PAGES_PER_GROUP { + // generate page + let mut values = Vec::with_capacity(VALUES_PER_PAGE); + let mut def_levels = Vec::with_capacity(VALUES_PER_PAGE); + for k in 0..VALUES_PER_PAGE { + let def_level = if rng.random::() < null_density { + max_def_level - 1 + } else { + max_def_level + }; + if def_level == max_def_level { + let string_value = if const_string { + "01234567890123456789012345678901".to_string() + } else { + format!("01234567890123456789012345678901:{k}{j}{i}") + }; + values.push(parquet::data_type::ByteArray::from(string_value.as_str())); + } + def_levels.push(def_level); + } + let mut page_builder = + DataPageBuilderImpl::new(column_desc.clone(), values.len() as u32, true); + page_builder.add_rep_levels(max_rep_level, &rep_levels); + page_builder.add_def_levels(max_def_level, &def_levels); + page_builder.add_values::(Encoding::DELTA_BYTE_ARRAY, &values); + column_chunk_pages.push(page_builder.consume()); + } + pages.push(column_chunk_pages); + } + + InMemoryPageIterator::new(pages) +} + fn build_plain_encoded_byte_array_page_iterator( column_desc: ColumnDescPtr, null_density: f32, @@ -1685,6 +1730,40 @@ fn add_benches(c: &mut Criterion) { assert_eq!(count, EXPECTED_VALUE_COUNT); }); + let delta_string_const_no_null_data = + build_constant_prefix_delta_encoded_byte_array_page_iterator( + mandatory_string_column_desc.clone(), + 0.0, + true, + ); + group.bench_function("const delta byte array encoded, mandatory, no NULLs", |b| { + b.iter(|| { + let array_reader = create_byte_array_reader( + delta_string_const_no_null_data.clone(), + mandatory_string_column_desc.clone(), + ); + count = bench_array_reader(array_reader); + }); + assert_eq!(count, EXPECTED_VALUE_COUNT); + }); + + let delta_string_const_prefix_no_null_data = + build_constant_prefix_delta_encoded_byte_array_page_iterator( + mandatory_string_column_desc.clone(), + 0.0, + false, + ); + group.bench_function("const prefix delta byte array encoded, mandatory, no NULLs", |b| { + b.iter(|| { + let array_reader = create_byte_array_reader( + delta_string_const_prefix_no_null_data.clone(), + mandatory_string_column_desc.clone(), + ); + count = bench_array_reader(array_reader); + }); + assert_eq!(count, EXPECTED_VALUE_COUNT); + }); + group.finish(); // binary benchmarks From 11fb1496aa160b44fe5e3cb570c0d3e812e23c2e Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 2 Mar 2026 10:47:20 -0800 Subject: [PATCH 04/16] formatting --- parquet/benches/arrow_reader.rs | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/parquet/benches/arrow_reader.rs b/parquet/benches/arrow_reader.rs index 081901b52587..d8ccc82873d9 100644 --- a/parquet/benches/arrow_reader.rs +++ b/parquet/benches/arrow_reader.rs @@ -1753,16 +1753,19 @@ fn add_benches(c: &mut Criterion) { 0.0, false, ); - group.bench_function("const prefix delta byte array encoded, mandatory, no NULLs", |b| { - b.iter(|| { - let array_reader = create_byte_array_reader( - delta_string_const_prefix_no_null_data.clone(), - mandatory_string_column_desc.clone(), - ); - count = bench_array_reader(array_reader); - }); - assert_eq!(count, EXPECTED_VALUE_COUNT); - }); + group.bench_function( + "const prefix delta byte array encoded, mandatory, no NULLs", + |b| { + b.iter(|| { + let array_reader = create_byte_array_reader( + delta_string_const_prefix_no_null_data.clone(), + mandatory_string_column_desc.clone(), + ); + count = bench_array_reader(array_reader); + }); + assert_eq!(count, EXPECTED_VALUE_COUNT); + }, + ); group.finish(); From 1361d8a6d1c3f5ce31bf61e85193d719e8113bde Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 2 Mar 2026 12:08:19 -0800 Subject: [PATCH 05/16] change where delta length byte array is tested --- parquet/benches/arrow_reader.rs | 43 +++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/parquet/benches/arrow_reader.rs b/parquet/benches/arrow_reader.rs index d8ccc82873d9..08bdf54cd773 100644 --- a/parquet/benches/arrow_reader.rs +++ b/parquet/benches/arrow_reader.rs @@ -486,9 +486,10 @@ fn build_plain_encoded_byte_array_page_iterator_inner( InMemoryPageIterator::new(pages) } -fn build_constant_prefix_delta_encoded_byte_array_page_iterator( +fn build_constant_prefix_byte_array_page_iterator( column_desc: ColumnDescPtr, null_density: f32, + encoding: Encoding, const_string: bool, ) -> impl PageIterator + Clone { let max_def_level = column_desc.max_def_level(); @@ -522,7 +523,7 @@ fn build_constant_prefix_delta_encoded_byte_array_page_iterator( DataPageBuilderImpl::new(column_desc.clone(), values.len() as u32, true); page_builder.add_rep_levels(max_rep_level, &rep_levels); page_builder.add_def_levels(max_def_level, &def_levels); - page_builder.add_values::(Encoding::DELTA_BYTE_ARRAY, &values); + page_builder.add_values::(encoding, &values); column_chunk_pages.push(page_builder.consume()); } pages.push(column_chunk_pages); @@ -1469,20 +1470,6 @@ fn decimal_benches(c: &mut Criterion) { ); group.finish(); - let mut group = c.benchmark_group("arrow_array_reader/BYTE_ARRAY/Decimal128Array"); - let mandatory_decimal3_leaf_desc = schema.column(10); - let optional_decimal3_leaf_desc = schema.column(11); - bench_byte_decimal::( - &mut group, - &mandatory_decimal3_leaf_desc, - &optional_decimal3_leaf_desc, - Encoding::DELTA_LENGTH_BYTE_ARRAY, - // precision is 16: the max is 9999999999999999 - 9999999999999000, - 9999999999999999, - ); - group.finish(); - // parquet FIXED_LEN_BYTE_ARRAY, logical type decimal(16,2) let mut group = c.benchmark_group("arrow_array_reader/FIXED_LEN_BYTE_ARRAY/Decimal128Array"); let mandatory_decimal4_leaf_desc = schema.column(12); @@ -1731,9 +1718,10 @@ fn add_benches(c: &mut Criterion) { }); let delta_string_const_no_null_data = - build_constant_prefix_delta_encoded_byte_array_page_iterator( + build_constant_prefix_byte_array_page_iterator( mandatory_string_column_desc.clone(), 0.0, + Encoding::DELTA_BYTE_ARRAY, true, ); group.bench_function("const delta byte array encoded, mandatory, no NULLs", |b| { @@ -1748,9 +1736,10 @@ fn add_benches(c: &mut Criterion) { }); let delta_string_const_prefix_no_null_data = - build_constant_prefix_delta_encoded_byte_array_page_iterator( + build_constant_prefix_byte_array_page_iterator( mandatory_string_column_desc.clone(), 0.0, + Encoding::DELTA_BYTE_ARRAY, false, ); group.bench_function( @@ -1767,6 +1756,24 @@ fn add_benches(c: &mut Criterion) { }, ); + let delta_string_const_no_null_data = + build_constant_prefix_byte_array_page_iterator( + mandatory_string_column_desc.clone(), + 0.0, + Encoding::DELTA_LENGTH_BYTE_ARRAY, + true, + ); + group.bench_function("const delta length byte array encoded, mandatory, no NULLs", |b| { + b.iter(|| { + let array_reader = create_byte_array_reader( + delta_string_const_no_null_data.clone(), + mandatory_string_column_desc.clone(), + ); + count = bench_array_reader(array_reader); + }); + assert_eq!(count, EXPECTED_VALUE_COUNT); + }); + group.finish(); // binary benchmarks From 41a2467beeca82370efd2cb41d3a843f2b618b37 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 2 Mar 2026 13:59:28 -0800 Subject: [PATCH 06/16] fix const prefix case --- parquet/benches/arrow_reader.rs | 64 ++++++++++++++++----------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/parquet/benches/arrow_reader.rs b/parquet/benches/arrow_reader.rs index 08bdf54cd773..5fda229d58a9 100644 --- a/parquet/benches/arrow_reader.rs +++ b/parquet/benches/arrow_reader.rs @@ -513,7 +513,7 @@ fn build_constant_prefix_byte_array_page_iterator( let string_value = if const_string { "01234567890123456789012345678901".to_string() } else { - format!("01234567890123456789012345678901:{k}{j}{i}") + format!("01234567890123456789012345678901:{:x}{j}{i}", (k % 16)) }; values.push(parquet::data_type::ByteArray::from(string_value.as_str())); } @@ -1717,13 +1717,12 @@ fn add_benches(c: &mut Criterion) { assert_eq!(count, EXPECTED_VALUE_COUNT); }); - let delta_string_const_no_null_data = - build_constant_prefix_byte_array_page_iterator( - mandatory_string_column_desc.clone(), - 0.0, - Encoding::DELTA_BYTE_ARRAY, - true, - ); + let delta_string_const_no_null_data = build_constant_prefix_byte_array_page_iterator( + mandatory_string_column_desc.clone(), + 0.0, + Encoding::DELTA_BYTE_ARRAY, + true, + ); group.bench_function("const delta byte array encoded, mandatory, no NULLs", |b| { b.iter(|| { let array_reader = create_byte_array_reader( @@ -1735,13 +1734,12 @@ fn add_benches(c: &mut Criterion) { assert_eq!(count, EXPECTED_VALUE_COUNT); }); - let delta_string_const_prefix_no_null_data = - build_constant_prefix_byte_array_page_iterator( - mandatory_string_column_desc.clone(), - 0.0, - Encoding::DELTA_BYTE_ARRAY, - false, - ); + let delta_string_const_prefix_no_null_data = build_constant_prefix_byte_array_page_iterator( + mandatory_string_column_desc.clone(), + 0.0, + Encoding::DELTA_BYTE_ARRAY, + false, + ); group.bench_function( "const prefix delta byte array encoded, mandatory, no NULLs", |b| { @@ -1756,23 +1754,25 @@ fn add_benches(c: &mut Criterion) { }, ); - let delta_string_const_no_null_data = - build_constant_prefix_byte_array_page_iterator( - mandatory_string_column_desc.clone(), - 0.0, - Encoding::DELTA_LENGTH_BYTE_ARRAY, - true, - ); - group.bench_function("const delta length byte array encoded, mandatory, no NULLs", |b| { - b.iter(|| { - let array_reader = create_byte_array_reader( - delta_string_const_no_null_data.clone(), - mandatory_string_column_desc.clone(), - ); - count = bench_array_reader(array_reader); - }); - assert_eq!(count, EXPECTED_VALUE_COUNT); - }); + let delta_string_const_no_null_data = build_constant_prefix_byte_array_page_iterator( + mandatory_string_column_desc.clone(), + 0.0, + Encoding::DELTA_LENGTH_BYTE_ARRAY, + true, + ); + group.bench_function( + "const delta length byte array encoded, mandatory, no NULLs", + |b| { + b.iter(|| { + let array_reader = create_byte_array_reader( + delta_string_const_no_null_data.clone(), + mandatory_string_column_desc.clone(), + ); + count = bench_array_reader(array_reader); + }); + assert_eq!(count, EXPECTED_VALUE_COUNT); + }, + ); group.finish(); From e75f3347fbaa924dc10ae01cf95697f7bb283c76 Mon Sep 17 00:00:00 2001 From: seidl Date: Mon, 2 Mar 2026 14:47:19 -0800 Subject: [PATCH 07/16] switch up order --- parquet/benches/arrow_reader.rs | 37 ++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/parquet/benches/arrow_reader.rs b/parquet/benches/arrow_reader.rs index 5fda229d58a9..60100b0abddf 100644 --- a/parquet/benches/arrow_reader.rs +++ b/parquet/benches/arrow_reader.rs @@ -1717,23 +1717,7 @@ fn add_benches(c: &mut Criterion) { assert_eq!(count, EXPECTED_VALUE_COUNT); }); - let delta_string_const_no_null_data = build_constant_prefix_byte_array_page_iterator( - mandatory_string_column_desc.clone(), - 0.0, - Encoding::DELTA_BYTE_ARRAY, - true, - ); - group.bench_function("const delta byte array encoded, mandatory, no NULLs", |b| { - b.iter(|| { - let array_reader = create_byte_array_reader( - delta_string_const_no_null_data.clone(), - mandatory_string_column_desc.clone(), - ); - count = bench_array_reader(array_reader); - }); - assert_eq!(count, EXPECTED_VALUE_COUNT); - }); - + // delta byte array with constant prefix and suffix lengths let delta_string_const_prefix_no_null_data = build_constant_prefix_byte_array_page_iterator( mandatory_string_column_desc.clone(), 0.0, @@ -1754,6 +1738,25 @@ fn add_benches(c: &mut Criterion) { }, ); + // delta byte array with constant prefix and no suffix + let delta_string_const_no_null_data = build_constant_prefix_byte_array_page_iterator( + mandatory_string_column_desc.clone(), + 0.0, + Encoding::DELTA_BYTE_ARRAY, + true, + ); + group.bench_function("const delta byte array encoded, mandatory, no NULLs", |b| { + b.iter(|| { + let array_reader = create_byte_array_reader( + delta_string_const_no_null_data.clone(), + mandatory_string_column_desc.clone(), + ); + count = bench_array_reader(array_reader); + }); + assert_eq!(count, EXPECTED_VALUE_COUNT); + }); + + // delta length byte array with constant lengths let delta_string_const_no_null_data = build_constant_prefix_byte_array_page_iterator( mandatory_string_column_desc.clone(), 0.0, From 7021eeee72f51cdf7b1b675e7bd1c3d37de9bd20 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 3 Mar 2026 08:55:42 -0800 Subject: [PATCH 08/16] add step function to delta tests to get min_delta=0 but bit_width > 0 --- parquet/benches/arrow_reader.rs | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/parquet/benches/arrow_reader.rs b/parquet/benches/arrow_reader.rs index 60100b0abddf..af131a8fb5bf 100644 --- a/parquet/benches/arrow_reader.rs +++ b/parquet/benches/arrow_reader.rs @@ -330,6 +330,7 @@ fn build_delta_encoded_incr_primitive_page_iterator( column_desc: ColumnDescPtr, null_density: f32, increment: usize, + stepped: bool, ) -> impl PageIterator + Clone where T: parquet::data_type::DataType, @@ -347,7 +348,7 @@ where // generate page let mut values = Vec::with_capacity(VALUES_PER_PAGE); let mut def_levels = Vec::with_capacity(VALUES_PER_PAGE); - for _k in 0..VALUES_PER_PAGE { + for k in 0..VALUES_PER_PAGE { let def_level = if rng.random::() < null_density { max_def_level - 1 } else { @@ -355,7 +356,11 @@ where }; if def_level == max_def_level { let value = FromPrimitive::from_usize(running_val).unwrap(); - running_val += increment; + running_val = if !stepped || k % 2 == 1 { + running_val + increment + } else { + running_val + }; values.push(value); } def_levels.push(def_level); @@ -1159,6 +1164,7 @@ fn bench_primitive( mandatory_column_desc.clone(), 0.0, 0, + false, ); group.bench_function("binary packed single value", |b| { b.iter(|| { @@ -1174,6 +1180,7 @@ fn bench_primitive( mandatory_column_desc.clone(), 0.0, 1, + false, ); group.bench_function("binary packed increasing value", |b| { b.iter(|| { @@ -1184,6 +1191,22 @@ fn bench_primitive( assert_eq!(count, EXPECTED_VALUE_COUNT); }); + // binary packed increasing stepped + let data = build_delta_encoded_incr_primitive_page_iterator::( + mandatory_column_desc.clone(), + 0.0, + 1, + true, + ); + group.bench_function("binary packed stepped increasing value", |b| { + b.iter(|| { + let array_reader = + create_primitive_array_reader(data.clone(), mandatory_column_desc.clone()); + count = bench_array_reader_skip(array_reader); + }); + assert_eq!(count, EXPECTED_VALUE_COUNT); + }); + let data = build_encoded_primitive_page_iterator::( optional_column_desc.clone(), 0.0, From 7406ff1b43c0666c77d642295be423c2ca4b4320 Mon Sep 17 00:00:00 2001 From: seidl Date: Wed, 4 Mar 2026 12:51:59 -0800 Subject: [PATCH 09/16] add non-skip benches --- parquet/benches/arrow_reader.rs | 87 +++++++++++++++++++++++++-------- 1 file changed, 66 insertions(+), 21 deletions(-) diff --git a/parquet/benches/arrow_reader.rs b/parquet/benches/arrow_reader.rs index af131a8fb5bf..0efa13b687c0 100644 --- a/parquet/benches/arrow_reader.rs +++ b/parquet/benches/arrow_reader.rs @@ -1159,6 +1159,39 @@ fn bench_primitive( assert_eq!(count, EXPECTED_VALUE_COUNT); }); + let data = build_encoded_primitive_page_iterator::( + optional_column_desc.clone(), + 0.0, + Encoding::DELTA_BINARY_PACKED, + min, + max, + ); + group.bench_function("binary packed skip, optional, no NULLs", |b| { + b.iter(|| { + let array_reader = + create_primitive_array_reader(data.clone(), optional_column_desc.clone()); + count = bench_array_reader_skip(array_reader); + }); + assert_eq!(count, EXPECTED_VALUE_COUNT); + }); + + // binary packed skip, half NULLs + let data = build_encoded_primitive_page_iterator::( + optional_column_desc.clone(), + 0.5, + Encoding::DELTA_BINARY_PACKED, + min, + max, + ); + group.bench_function("binary packed skip, optional, half NULLs", |b| { + b.iter(|| { + let array_reader = + create_primitive_array_reader(data.clone(), optional_column_desc.clone()); + count = bench_array_reader_skip(array_reader); + }); + assert_eq!(count, EXPECTED_VALUE_COUNT); + }); + // binary packed same value let data = build_delta_encoded_incr_primitive_page_iterator::( mandatory_column_desc.clone(), @@ -1167,6 +1200,21 @@ fn bench_primitive( false, ); group.bench_function("binary packed single value", |b| { + b.iter(|| { + let array_reader = + create_primitive_array_reader(data.clone(), mandatory_column_desc.clone()); + count = bench_array_reader(array_reader); + }); + assert_eq!(count, EXPECTED_VALUE_COUNT); + }); + + let data = build_delta_encoded_incr_primitive_page_iterator::( + mandatory_column_desc.clone(), + 0.0, + 0, + false, + ); + group.bench_function("binary skip packed single value", |b| { b.iter(|| { let array_reader = create_primitive_array_reader(data.clone(), mandatory_column_desc.clone()); @@ -1186,19 +1234,18 @@ fn bench_primitive( b.iter(|| { let array_reader = create_primitive_array_reader(data.clone(), mandatory_column_desc.clone()); - count = bench_array_reader_skip(array_reader); + count = bench_array_reader(array_reader); }); assert_eq!(count, EXPECTED_VALUE_COUNT); }); - // binary packed increasing stepped let data = build_delta_encoded_incr_primitive_page_iterator::( mandatory_column_desc.clone(), 0.0, 1, - true, + false, ); - group.bench_function("binary packed stepped increasing value", |b| { + group.bench_function("binary packed skip increasing value", |b| { b.iter(|| { let array_reader = create_primitive_array_reader(data.clone(), mandatory_column_desc.clone()); @@ -1207,34 +1254,32 @@ fn bench_primitive( assert_eq!(count, EXPECTED_VALUE_COUNT); }); - let data = build_encoded_primitive_page_iterator::( - optional_column_desc.clone(), + // binary packed increasing stepped + let data = build_delta_encoded_incr_primitive_page_iterator::( + mandatory_column_desc.clone(), 0.0, - Encoding::DELTA_BINARY_PACKED, - min, - max, + 1, + true, ); - group.bench_function("binary packed skip, optional, no NULLs", |b| { + group.bench_function("binary packed stepped increasing value", |b| { b.iter(|| { let array_reader = - create_primitive_array_reader(data.clone(), optional_column_desc.clone()); - count = bench_array_reader_skip(array_reader); + create_primitive_array_reader(data.clone(), mandatory_column_desc.clone()); + count = bench_array_reader(array_reader); }); assert_eq!(count, EXPECTED_VALUE_COUNT); }); - // binary packed skip, half NULLs - let data = build_encoded_primitive_page_iterator::( - optional_column_desc.clone(), - 0.5, - Encoding::DELTA_BINARY_PACKED, - min, - max, + let data = build_delta_encoded_incr_primitive_page_iterator::( + mandatory_column_desc.clone(), + 0.0, + 1, + true, ); - group.bench_function("binary packed skip, optional, half NULLs", |b| { + group.bench_function("binary packed skip stepped increasing value", |b| { b.iter(|| { let array_reader = - create_primitive_array_reader(data.clone(), optional_column_desc.clone()); + create_primitive_array_reader(data.clone(), mandatory_column_desc.clone()); count = bench_array_reader_skip(array_reader); }); assert_eq!(count, EXPECTED_VALUE_COUNT); From 11b7c698425c54876eedda5a1089c9f7a1c34459 Mon Sep 17 00:00:00 2001 From: seidl Date: Wed, 4 Mar 2026 13:08:49 -0800 Subject: [PATCH 10/16] typo --- parquet/benches/arrow_reader.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/benches/arrow_reader.rs b/parquet/benches/arrow_reader.rs index 0efa13b687c0..14fa16b3531e 100644 --- a/parquet/benches/arrow_reader.rs +++ b/parquet/benches/arrow_reader.rs @@ -1214,7 +1214,7 @@ fn bench_primitive( 0, false, ); - group.bench_function("binary skip packed single value", |b| { + group.bench_function("binary packed skip single value", |b| { b.iter(|| { let array_reader = create_primitive_array_reader(data.clone(), mandatory_column_desc.clone()); From 141f54142f977d2caa5a7cf8eb109bf4c428bb31 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 24 Feb 2026 12:41:37 -0800 Subject: [PATCH 11/16] delta binary packed optimization for bitwidth==0 --- parquet/src/encodings/decoding.rs | 71 +++++++++++++++++++++++++++---- 1 file changed, 62 insertions(+), 9 deletions(-) diff --git a/parquet/src/encodings/decoding.rs b/parquet/src/encodings/decoding.rs index 58430820a9b6..60009747cfae 100644 --- a/parquet/src/encodings/decoding.rs +++ b/parquet/src/encodings/decoding.rs @@ -770,15 +770,44 @@ where // At this point we have read the deltas to `buffer` we now need to offset // these to get back to the original values that were encoded - for v in &mut buffer[read..read + batch_read] { - // It is OK for deltas to contain "overflowed" values after encoding, - // e.g. i64::MAX - i64::MIN, so we use `wrapping_add` to "overflow" again and - // restore original value. - *v = v - .wrapping_add(&self.min_delta) - .wrapping_add(&self.last_value); - - self.last_value = *v; + // + // Optimization: if the bit_width for the miniblock is 0, then we can employ + // a faster decoding method than setting `value[i] = value[i-1] + value[i] + min_delta`. + // Where min_delta is 0 (all values in the miniblock are the same), we can simply + // set all values to `self.last_value`. In the case of non-zero min_delta (values + // in the mini-block form an arithmetic progression) each value can be computed via + // `value[i] = (i + 1) * min_delta + last_value`. In both cases we remove the + // dependence on the preceding value. + // Kudos to @pitrou for the idea https://github.com/apache/arrow/pull/49296 + if bit_width == 0 { + let min_delta = self.min_delta.as_i64()?; + if min_delta == 0 { + for v in &mut buffer[read..read + batch_read] { + *v = self.last_value; + } + } else { + // the c++ version multiplies min_delta by the iter index, but doing + // wrapping_mul through T::T was a bit slower. this is still + // faster than before. + let mut delta = self.min_delta; + for v in &mut buffer[read..read + batch_read] { + *v = self.last_value.wrapping_add(&delta); + delta = delta.wrapping_add(&self.min_delta); + } + + self.last_value = buffer[read + batch_read - 1]; + } + } else { + for v in &mut buffer[read..read + batch_read] { + // It is OK for deltas to contain "overflowed" values after encoding, + // e.g. i64::MAX - i64::MIN, so we use `wrapping_add` to "overflow" again and + // restore original value. + *v = v + .wrapping_add(&self.min_delta) + .wrapping_add(&self.last_value); + + self.last_value = *v; + } } read += batch_read; @@ -1802,6 +1831,30 @@ mod tests { ); } + #[test] + fn test_delta_bit_packed_int32_single_value_large() { + let block_data = vec![3; 10240]; + test_delta_bit_packed_decode::(vec![block_data]); + } + + #[test] + fn test_delta_bit_packed_int32_increasing_value_large() { + let block_data = (0i32..10240).collect(); + test_delta_bit_packed_decode::(vec![block_data]); + } + + #[test] + fn test_delta_bit_packed_int64_single_value_large() { + let block_data = vec![5; 10240]; + test_delta_bit_packed_decode::(vec![block_data]); + } + + #[test] + fn test_delta_bit_packed_int64_increasing_value_large() { + let block_data = (0i64..10240).collect(); + test_delta_bit_packed_decode::(vec![block_data]); + } + #[test] fn test_delta_byte_array_same_arrays() { let data = vec![ From 7fc5443b8e3cd420fbcb122916299aa33363b5eb Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 24 Feb 2026 16:50:40 -0800 Subject: [PATCH 12/16] add another test --- parquet/src/encodings/decoding.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/parquet/src/encodings/decoding.rs b/parquet/src/encodings/decoding.rs index 60009747cfae..2caa19a35a30 100644 --- a/parquet/src/encodings/decoding.rs +++ b/parquet/src/encodings/decoding.rs @@ -1843,6 +1843,25 @@ mod tests { test_delta_bit_packed_decode::(vec![block_data]); } + #[test] + fn test_delta_bit_packed_int32_mixed_large() { + // should be enough for 4 mini-blocks plus a little so we get some + // mixed mini-blocks + const BLOCK_SIZE: i32 = 133; + let block1_data = (0..BLOCK_SIZE).map(|i| (i * 7) % 11).collect(); + let block2_data = vec![3; BLOCK_SIZE as usize]; + let block3_data = (0..BLOCK_SIZE).map(|i| (i * 5) % 13).collect(); + let block4_data = (0..BLOCK_SIZE).collect(); + let block5_data = (0..BLOCK_SIZE).map(|i| (i * 3) % 17).collect(); + test_delta_bit_packed_decode::(vec![ + block1_data, + block2_data, + block3_data, + block4_data, + block5_data, + ]); + } + #[test] fn test_delta_bit_packed_int64_single_value_large() { let block_data = vec![5; 10240]; From 0b76f6d81cbe2ae75a9eb6379ac70ecc9d2ea9a9 Mon Sep 17 00:00:00 2001 From: seidl Date: Mon, 2 Mar 2026 17:34:19 -0800 Subject: [PATCH 13/16] use fill --- parquet/src/encodings/decoding.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/parquet/src/encodings/decoding.rs b/parquet/src/encodings/decoding.rs index 2caa19a35a30..bdad0c6f9618 100644 --- a/parquet/src/encodings/decoding.rs +++ b/parquet/src/encodings/decoding.rs @@ -782,9 +782,7 @@ where if bit_width == 0 { let min_delta = self.min_delta.as_i64()?; if min_delta == 0 { - for v in &mut buffer[read..read + batch_read] { - *v = self.last_value; - } + buffer[read..read + batch_read].fill(self.last_value); } else { // the c++ version multiplies min_delta by the iter index, but doing // wrapping_mul through T::T was a bit slower. this is still From 943316995b8f9ca0ca48b780b91b3c7ff743310b Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 3 Mar 2026 09:47:22 -0800 Subject: [PATCH 14/16] add another optimization for min_delta==0 --- parquet/src/encodings/decoding.rs | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/parquet/src/encodings/decoding.rs b/parquet/src/encodings/decoding.rs index bdad0c6f9618..a04f495d8d77 100644 --- a/parquet/src/encodings/decoding.rs +++ b/parquet/src/encodings/decoding.rs @@ -779,8 +779,8 @@ where // `value[i] = (i + 1) * min_delta + last_value`. In both cases we remove the // dependence on the preceding value. // Kudos to @pitrou for the idea https://github.com/apache/arrow/pull/49296 + let min_delta = self.min_delta.as_i64()?; if bit_width == 0 { - let min_delta = self.min_delta.as_i64()?; if min_delta == 0 { buffer[read..read + batch_read].fill(self.last_value); } else { @@ -796,15 +796,21 @@ where self.last_value = buffer[read + batch_read - 1]; } } else { - for v in &mut buffer[read..read + batch_read] { - // It is OK for deltas to contain "overflowed" values after encoding, - // e.g. i64::MAX - i64::MIN, so we use `wrapping_add` to "overflow" again and - // restore original value. - *v = v - .wrapping_add(&self.min_delta) - .wrapping_add(&self.last_value); - - self.last_value = *v; + // It is OK for deltas to contain "overflowed" values after encoding, + // e.g. i64::MAX - i64::MIN, so we use `wrapping_add` to "overflow" again and + // restore original value. + if min_delta == 0 { + for v in &mut buffer[read..read + batch_read] { + *v = v.wrapping_add(&self.last_value); + self.last_value = *v; + } + } else { + for v in &mut buffer[read..read + batch_read] { + *v = v + .wrapping_add(&self.min_delta) + .wrapping_add(&self.last_value); + self.last_value = *v; + } } } From 8f7abd26fa1245c459c5e91b5e24e4ab5183a3c1 Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 5 Mar 2026 13:57:59 -0800 Subject: [PATCH 15/16] accelerate skip as well --- parquet/src/encodings/decoding.rs | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/parquet/src/encodings/decoding.rs b/parquet/src/encodings/decoding.rs index a04f495d8d77..3bf0e90f2187 100644 --- a/parquet/src/encodings/decoding.rs +++ b/parquet/src/encodings/decoding.rs @@ -873,12 +873,33 @@ where )); } - for v in &mut skip_buffer[0..skip_count] { - *v = v - .wrapping_add(&self.min_delta) - .wrapping_add(&self.last_value); + // see commentary in self.get() above regarding optimizations + let min_delta = self.min_delta.as_i64()?; + if bit_width == 0 { + // if min_delta == 0, there's nothing to do. self.last_value is unchanged + if min_delta != 0 { + let mut delta = self.min_delta; + for v in &mut skip_buffer[0..skip_count] { + *v = self.last_value.wrapping_add(&delta); + delta = delta.wrapping_add(&self.min_delta); + } - self.last_value = *v; + self.last_value = skip_buffer[skip_count - 1]; + } + } else if min_delta == 0 { + for v in &mut skip_buffer[0..skip_count] { + *v = v.wrapping_add(&self.last_value); + + self.last_value = *v; + } + } else { + for v in &mut skip_buffer[0..skip_count] { + *v = v + .wrapping_add(&self.min_delta) + .wrapping_add(&self.last_value); + + self.last_value = *v; + } } skip += mini_block_should_skip; From a232ec9df44daafaa9ead76217b68ac783b3e280 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 5 Mar 2026 14:40:03 -0800 Subject: [PATCH 16/16] add more tests --- parquet/src/encodings/decoding.rs | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/parquet/src/encodings/decoding.rs b/parquet/src/encodings/decoding.rs index 3bf0e90f2187..7da21e6dd091 100644 --- a/parquet/src/encodings/decoding.rs +++ b/parquet/src/encodings/decoding.rs @@ -1862,12 +1862,39 @@ mod tests { test_delta_bit_packed_decode::(vec![block_data]); } + #[test] + fn test_delta_bit_packed_int32_single_value_skip_large() { + let block_data = vec![3; 10240]; + test_skip::(block_data.clone(), Encoding::DELTA_BINARY_PACKED, 50); + test_skip::(block_data, Encoding::DELTA_BINARY_PACKED, 5000); + } + #[test] fn test_delta_bit_packed_int32_increasing_value_large() { let block_data = (0i32..10240).collect(); test_delta_bit_packed_decode::(vec![block_data]); } + #[test] + fn test_delta_bit_packed_int32_increasing_value_skip_large() { + let block_data = (0i32..10240).collect::>(); + test_skip::(block_data.clone(), Encoding::DELTA_BINARY_PACKED, 50); + test_skip::(block_data, Encoding::DELTA_BINARY_PACKED, 5000); + } + + #[test] + fn test_delta_bit_packed_int32_stepped_value_large() { + let block_data = (0i32..10240).map(|i| i / 2).collect(); + test_delta_bit_packed_decode::(vec![block_data]); + } + + #[test] + fn test_delta_bit_packed_int32_stepped_value_skip_large() { + let block_data = (0i32..10240).map(|i| i / 2).collect::>(); + test_skip::(block_data.clone(), Encoding::DELTA_BINARY_PACKED, 50); + test_skip::(block_data, Encoding::DELTA_BINARY_PACKED, 5000); + } + #[test] fn test_delta_bit_packed_int32_mixed_large() { // should be enough for 4 mini-blocks plus a little so we get some