From 30fa48cda0ff3aca9f28866836bf4e458cca267b Mon Sep 17 00:00:00 2001
From: atheendre130505
Date: Sat, 1 Nov 2025 17:39:11 +0530
Subject: [PATCH] Fix map_query_sql benchmark duplicate key error

The build_keys() function was generating 1000 random keys from range
0..9999, which could result in duplicate keys due to the birthday
paradox. The map() function requires unique keys, causing the benchmark
to fail with:
'Execution("map key must be unique, duplicate key found: {key}")'

This fix ensures all generated keys are unique by:
- Using a HashSet to track seen keys
- Only adding keys to the result if they haven't been seen before
- Continuing to generate until exactly 1000 unique keys are produced

Fixes #18421
---
 datafusion/core/benches/map_query_sql.rs | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/datafusion/core/benches/map_query_sql.rs b/datafusion/core/benches/map_query_sql.rs
index 063b8e6c86bb..76b8e3ba7c3a 100644
--- a/datafusion/core/benches/map_query_sql.rs
+++ b/datafusion/core/benches/map_query_sql.rs
@@ -15,6 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use std::collections::HashSet;
 use std::sync::Arc;
 
 use arrow::array::{ArrayRef, Int32Array, RecordBatch};
@@ -33,8 +34,15 @@ mod data_utils;
 
 fn build_keys(rng: &mut ThreadRng) -> Vec<String> {
     let mut keys = vec![];
-    for _ in 0..1000 {
-        keys.push(rng.random_range(0..9999).to_string());
+    let mut seen = HashSet::with_capacity(1000);
+    // Generate unique keys by tracking seen keys
+    while keys.len() < 1000 {
+        let key = rng.random_range(0..9999).to_string();
+        if seen.insert(key.clone()) {
+            // Only push if it's a new unique key
+            keys.push(key);
+        }
+        // If key was already in set, skip it and generate another
     }
     keys
 }