From 1940b9f48296c873724a6683b5c8b332cb60abbc Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Thu, 30 Jan 2025 10:48:45 -0500 Subject: [PATCH 01/32] change for static schema --- src/event/format/json.rs | 11 ++++++++++- src/event/format/mod.rs | 35 +++++++++++++++++++++++++++++++--- src/handlers/http/logstream.rs | 2 +- 3 files changed, 43 insertions(+), 5 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 5006be142..01bf8af62 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -178,7 +178,9 @@ fn valid_type(data_type: &DataType, value: &Value, schema_version: SchemaVersion DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64 => value.is_u64(), DataType::Float16 | DataType::Float32 => value.is_f64(), // All numbers can be cast as Float64 from schema version v1 - DataType::Float64 if schema_version == SchemaVersion::V1 => value.is_number(), + DataType::Float64 if schema_version == SchemaVersion::V1 => { + value.is_number() || is_parsable_as_number(value) + } DataType::Float64 if schema_version != SchemaVersion::V1 => value.is_f64(), DataType::Utf8 => value.is_string(), DataType::List(field) => { @@ -225,3 +227,10 @@ fn valid_type(data_type: &DataType, value: &Value, schema_version: SchemaVersion } } } + +pub fn is_parsable_as_number(value: &Value) -> bool { + let Value::String(s) = value else { + return false; + }; + s.parse::().is_ok() +} diff --git a/src/event/format/mod.rs b/src/event/format/mod.rs index c0a2ec323..3b4934c32 100644 --- a/src/event/format/mod.rs +++ b/src/event/format/mod.rs @@ -161,12 +161,20 @@ pub trait EventFormat: Sized { } for field in new_schema.fields() { let Some(storage_field) = storage_schema.get(field.name()) else { + println!("field not found in storage schema"); + println!("field: {:?}", field); return false; }; if field.name() != storage_field.name() { + println!("field name mismatch"); + println!("storage field: {:?}", storage_field); + println!("field: {:?}", field); return false; } if field.data_type() != storage_field.data_type() { + println!("data type mismatch"); + println!("storage field: {:?}", storage_field); + println!("field: {:?}", field); return false; } } @@ -242,8 +250,12 @@ pub fn update_field_type_in_schema( if let Some(log_records) = log_records { for log_record in log_records { - updated_schema = - override_data_type(updated_schema.clone(), log_record.clone(), schema_version); + updated_schema = override_data_type( + updated_schema.clone(), + log_record.clone(), + schema_version, + existing_schema, + ); } } @@ -276,6 +288,7 @@ pub fn override_data_type( inferred_schema: Arc, log_record: Value, schema_version: SchemaVersion, + existing_schema: Option<&HashMap>>, ) -> Arc { let Value::Object(map) = log_record else { return inferred_schema; @@ -304,6 +317,23 @@ pub fn override_data_type( true, ) } + (SchemaVersion::V1, Some(Value::String(s))) + if existing_schema.is_none() + || (existing_schema.is_some() + && existing_schema.unwrap().get(field_name).is_some() + && existing_schema + .unwrap() + .get(field_name) + .unwrap() + .data_type() + == &DataType::Float64 + && field.data_type() == &DataType::Utf8 + && s.parse::().is_ok()) => + { + // Update the field's data type to Float64 + Field::new(field_name, DataType::Float64, true) + } + // in V1 for new fields in json with inferred type number, cast as float64. 
(SchemaVersion::V1, Some(Value::Number(_))) if field.data_type().is_numeric() => { // Update the field's data type to Float64 @@ -314,6 +344,5 @@ pub fn override_data_type( } }) .collect(); - Arc::new(Schema::new(updated_schema)) } diff --git a/src/handlers/http/logstream.rs b/src/handlers/http/logstream.rs index 2aa544dbe..760edbb67 100644 --- a/src/handlers/http/logstream.rs +++ b/src/handlers/http/logstream.rs @@ -127,7 +127,7 @@ pub async fn detect_schema(Json(json): Json) -> Result Date: Thu, 30 Jan 2025 13:46:58 -0500 Subject: [PATCH 02/32] remove println --- src/event/format/mod.rs | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/event/format/mod.rs b/src/event/format/mod.rs index 3b4934c32..56d61e0a9 100644 --- a/src/event/format/mod.rs +++ b/src/event/format/mod.rs @@ -161,20 +161,12 @@ pub trait EventFormat: Sized { } for field in new_schema.fields() { let Some(storage_field) = storage_schema.get(field.name()) else { - println!("field not found in storage schema"); - println!("field: {:?}", field); return false; }; if field.name() != storage_field.name() { - println!("field name mismatch"); - println!("storage field: {:?}", storage_field); - println!("field: {:?}", field); return false; } if field.data_type() != storage_field.data_type() { - println!("data type mismatch"); - println!("storage field: {:?}", storage_field); - println!("field: {:?}", field); return false; } } From d7ffaaf27b30222be06c0a1a4f83cd8426c67da5 Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Mon, 3 Feb 2025 03:29:47 -0500 Subject: [PATCH 03/32] changed compression to SNAPPY --- src/storage/object_storage.rs | 4 +++- src/storage/staging.rs | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/storage/object_storage.rs b/src/storage/object_storage.rs index 4dc8603df..f547eaaba 100644 --- a/src/storage/object_storage.rs +++ b/src/storage/object_storage.rs @@ -21,7 +21,9 @@ use super::{ ObjectStoreFormat, Permisssion, StorageDir, StorageMetadata, }; use super::{ - LogStream, Owner, StreamType, ALERTS_ROOT_DIRECTORY, MANIFEST_FILE, PARSEABLE_METADATA_FILE_NAME, PARSEABLE_ROOT_DIRECTORY, SCHEMA_FILE_NAME, STREAM_METADATA_FILE_NAME, STREAM_ROOT_DIRECTORY + LogStream, Owner, StreamType, ALERTS_ROOT_DIRECTORY, MANIFEST_FILE, + PARSEABLE_METADATA_FILE_NAME, PARSEABLE_ROOT_DIRECTORY, SCHEMA_FILE_NAME, + STREAM_METADATA_FILE_NAME, STREAM_ROOT_DIRECTORY, }; use crate::alerts::AlertConfig; diff --git a/src/storage/staging.rs b/src/storage/staging.rs index aab6603d9..90a7a1207 100644 --- a/src/storage/staging.rs +++ b/src/storage/staging.rs @@ -321,7 +321,7 @@ pub fn parquet_writer_props( }); let mut props = WriterProperties::builder() .set_max_row_group_size(CONFIG.options.row_group_size) - .set_compression(CONFIG.options.parquet_compression.into()) + .set_compression(parquet::basic::Compression::SNAPPY) .set_column_encoding( ColumnPath::new(vec![time_partition_field]), Encoding::DELTA_BINARY_PACKED, From ca5299381bf980a64c6066e4160b0c1e1ce0c82c Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Mon, 3 Feb 2025 03:46:13 -0500 Subject: [PATCH 04/32] converted to Int64 --- src/event/format/json.rs | 11 ++++------- src/event/format/mod.rs | 10 +++++----- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 01bf8af62..9351eb10f 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -174,14 +174,11 @@ fn fields_mismatch(schema: &[Arc], body: &Value, schema_version: SchemaVe fn valid_type(data_type: &DataType, 
value: &Value, schema_version: SchemaVersion) -> bool { match data_type { DataType::Boolean => value.is_boolean(), - DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 => value.is_i64(), + DataType::Int8 | DataType::Int16 | DataType::Int32 => value.is_i64(), DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64 => value.is_u64(), - DataType::Float16 | DataType::Float32 => value.is_f64(), + DataType::Float16 | DataType::Float32 | DataType::Float64 => value.is_f64(), // All numbers can be cast as Float64 from schema version v1 - DataType::Float64 if schema_version == SchemaVersion::V1 => { - value.is_number() || is_parsable_as_number(value) - } - DataType::Float64 if schema_version != SchemaVersion::V1 => value.is_f64(), + DataType::Int64 => value.is_i64() || is_parsable_as_number(value), DataType::Utf8 => value.is_string(), DataType::List(field) => { let data_type = field.data_type(); @@ -232,5 +229,5 @@ pub fn is_parsable_as_number(value: &Value) -> bool { let Value::String(s) = value else { return false; }; - s.parse::().is_ok() + s.parse::().is_ok() } diff --git a/src/event/format/mod.rs b/src/event/format/mod.rs index 56d61e0a9..21f4db6e0 100644 --- a/src/event/format/mod.rs +++ b/src/event/format/mod.rs @@ -318,18 +318,18 @@ pub fn override_data_type( .get(field_name) .unwrap() .data_type() - == &DataType::Float64 + == &DataType::Int64 && field.data_type() == &DataType::Utf8 - && s.parse::().is_ok()) => + && s.parse::().is_ok()) => { // Update the field's data type to Float64 - Field::new(field_name, DataType::Float64, true) + Field::new(field_name, DataType::Int64, true) } // in V1 for new fields in json with inferred type number, cast as float64. - (SchemaVersion::V1, Some(Value::Number(_))) if field.data_type().is_numeric() => { + (SchemaVersion::V1, Some(Value::Number(_))) if field.data_type().is_integer() => { // Update the field's data type to Float64 - Field::new(field_name, DataType::Float64, true) + Field::new(field_name, DataType::Int64, true) } // Return the original field if no update is needed _ => Field::new(field_name, field.data_type().clone(), true), From d7c85a996243af9b4ba0da3d2894ca90a138b5b4 Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Mon, 3 Feb 2025 04:17:54 -0500 Subject: [PATCH 05/32] reverted compression to lz4raw --- src/storage/staging.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/storage/staging.rs b/src/storage/staging.rs index 90a7a1207..aab6603d9 100644 --- a/src/storage/staging.rs +++ b/src/storage/staging.rs @@ -321,7 +321,7 @@ pub fn parquet_writer_props( }); let mut props = WriterProperties::builder() .set_max_row_group_size(CONFIG.options.row_group_size) - .set_compression(parquet::basic::Compression::SNAPPY) + .set_compression(CONFIG.options.parquet_compression.into()) .set_column_encoding( ColumnPath::new(vec![time_partition_field]), Encoding::DELTA_BINARY_PACKED, From c10d4fba8d3577153f663faf04324419e95d2288 Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Mon, 3 Feb 2025 04:38:13 -0500 Subject: [PATCH 06/32] url encode --- src/storage/staging.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/storage/staging.rs b/src/storage/staging.rs index aab6603d9..899eafda8 100644 --- a/src/storage/staging.rs +++ b/src/storage/staging.rs @@ -37,7 +37,7 @@ use parquet::{ arrow::ArrowWriter, basic::Encoding, errors::ParquetError, - file::properties::{WriterProperties, WriterPropertiesBuilder}, + file::properties::{EnabledStatistics, WriterProperties, 
WriterPropertiesBuilder}, format::SortingColumn, schema::types::ColumnPath, }; @@ -340,6 +340,11 @@ pub fn parquet_writer_props( } props = props.set_sorting_columns(Some(sorting_column_vec)); + let url_column_path = ColumnPath::new(vec!["URL".to_string()]); + let url_encoding = Encoding::DELTA_BYTE_ARRAY; + props = props.set_column_encoding(url_column_path, url_encoding); + props = props.set_statistics_enabled(EnabledStatistics::Chunk); + props } From 4087877a92c8d8aa9f048acfe382db707dc6c83b Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Mon, 3 Feb 2025 04:54:34 -0500 Subject: [PATCH 07/32] change statistics to page --- src/storage/staging.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/storage/staging.rs b/src/storage/staging.rs index 899eafda8..e360264fd 100644 --- a/src/storage/staging.rs +++ b/src/storage/staging.rs @@ -343,7 +343,7 @@ pub fn parquet_writer_props( let url_column_path = ColumnPath::new(vec!["URL".to_string()]); let url_encoding = Encoding::DELTA_BYTE_ARRAY; props = props.set_column_encoding(url_column_path, url_encoding); - props = props.set_statistics_enabled(EnabledStatistics::Chunk); + props = props.set_statistics_enabled(EnabledStatistics::Page); props } From 799959b5de208b5b413f98787efc684f34e7288a Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Mon, 3 Feb 2025 21:51:50 -0500 Subject: [PATCH 08/32] page size to 20mb, add data type for Date --- src/event/format/json.rs | 8 ++++++++ src/event/format/mod.rs | 12 +++++++++++- src/static_schema.rs | 1 + src/storage/staging.rs | 3 ++- 4 files changed, 22 insertions(+), 2 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 9351eb10f..1ff14085a 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -217,6 +217,7 @@ fn valid_type(data_type: &DataType, value: &Value, schema_version: SchemaVersion false } } + DataType::Date32 => value.is_string() && is_parsable_as_date(value), DataType::Timestamp(_, _) => value.is_string() || value.is_number(), _ => { error!("Unsupported datatype {:?}, value {:?}", data_type, value); @@ -231,3 +232,10 @@ pub fn is_parsable_as_number(value: &Value) -> bool { }; s.parse::().is_ok() } + +pub fn is_parsable_as_date(value: &Value) -> bool { + let Value::String(s) = value else { + return false; + }; + chrono::NaiveDate::parse_from_str(s, "%Y-%m-%d").is_ok() +} diff --git a/src/event/format/mod.rs b/src/event/format/mod.rs index 21f4db6e0..32af458d2 100644 --- a/src/event/format/mod.rs +++ b/src/event/format/mod.rs @@ -26,7 +26,7 @@ use std::{ use anyhow::{anyhow, Error as AnyError}; use arrow_array::RecordBatch; use arrow_schema::{DataType, Field, Schema, TimeUnit}; -use chrono::DateTime; +use chrono::{DateTime, NaiveDate}; use serde::{Deserialize, Serialize}; use serde_json::Value; @@ -325,6 +325,16 @@ pub fn override_data_type( // Update the field's data type to Float64 Field::new(field_name, DataType::Int64, true) } + (SchemaVersion::V1, Some(Value::String(s))) + if TIME_FIELD_NAME_PARTS + .iter() + .any(|part| field_name.to_lowercase().contains(part)) + && field.data_type() == &DataType::Utf8 + && NaiveDate::parse_from_str(s, "%Y-%m-%d").is_ok() => + { + // Update the field's data type to Timestamp + Field::new(field_name, DataType::Date32, true) + } // in V1 for new fields in json with inferred type number, cast as float64. 
(SchemaVersion::V1, Some(Value::Number(_))) if field.data_type().is_integer() => { diff --git a/src/static_schema.rs b/src/static_schema.rs index 6717175d0..6db0720ba 100644 --- a/src/static_schema.rs +++ b/src/static_schema.rs @@ -102,6 +102,7 @@ pub fn convert_static_schema_to_arrow_schema( "double" | "float" => DataType::Float64, "boolean" => DataType::Boolean, "string" => DataType::Utf8, + "date" => DataType::Date32, "datetime" => DataType::Timestamp(TimeUnit::Millisecond, None), "string_list" => { DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))) diff --git a/src/storage/staging.rs b/src/storage/staging.rs index e360264fd..9630de834 100644 --- a/src/storage/staging.rs +++ b/src/storage/staging.rs @@ -325,7 +325,8 @@ pub fn parquet_writer_props( .set_column_encoding( ColumnPath::new(vec![time_partition_field]), Encoding::DELTA_BINARY_PACKED, - ); + ) + .set_data_page_size_limit(20 * 1024 * 1024); for (field, index) in custom_partition_fields { let field = ColumnPath::new(vec![field]); From dd0ef2bccd4600a8ab74f7f95ad05d96b840e9e9 Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Tue, 4 Feb 2025 13:03:56 -0500 Subject: [PATCH 09/32] reverted Utf8View change --- src/query/mod.rs | 6 +++--- src/query/stream_schema_provider.rs | 6 +++--- src/storage/staging.rs | 3 ++- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/query/mod.rs b/src/query/mod.rs index 5006077a0..8b18ba675 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -494,7 +494,7 @@ fn transform( ) -> Transformed { plan.transform(&|plan| match plan { LogicalPlan::TableScan(table) => { - let mut new_filters = vec![]; + let new_filters = vec![]; if !table_contains_any_time_filters(&table, time_partition) { let mut _start_time_filter: Expr; let mut _end_time_filter: Expr; @@ -529,8 +529,8 @@ fn transform( } } - new_filters.push(_start_time_filter); - new_filters.push(_end_time_filter); + //new_filters.push(_start_time_filter); + //new_filters.push(_end_time_filter); } let new_filter = new_filters.into_iter().reduce(and); if let Some(new_filter) = new_filter { diff --git a/src/query/stream_schema_provider.rs b/src/query/stream_schema_provider.rs index fd0e88c57..eff01cd49 100644 --- a/src/query/stream_schema_provider.rs +++ b/src/query/stream_schema_provider.rs @@ -435,9 +435,9 @@ impl TableProvider for StandardTableProvider { .map_err(|err| DataFusionError::Plan(err.to_string()))?; let time_partition = object_store_format.time_partition; let mut time_filters = extract_primary_filter(filters, &time_partition); - if time_filters.is_empty() { - return Err(DataFusionError::Plan("potentially unbounded query on time range. Table scanning requires atleast one time bound".to_string())); - } + // if time_filters.is_empty() { + // return Err(DataFusionError::Plan("potentially unbounded query on time range. 
Table scanning requires atleast one time bound".to_string())); + // } if include_now(filters, &time_partition) { if let Some(records) = diff --git a/src/storage/staging.rs b/src/storage/staging.rs index 9630de834..a06d0754b 100644 --- a/src/storage/staging.rs +++ b/src/storage/staging.rs @@ -326,7 +326,8 @@ pub fn parquet_writer_props( ColumnPath::new(vec![time_partition_field]), Encoding::DELTA_BINARY_PACKED, ) - .set_data_page_size_limit(20 * 1024 * 1024); + .set_data_page_size_limit(20 * 1024 * 1024) + .set_data_page_row_count_limit(100000); for (field, index) in custom_partition_fields { let field = ColumnPath::new(vec![field]); From 568ff0aafbc9d32dfca5998c05ce33e443381914 Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Tue, 4 Feb 2025 13:44:31 -0500 Subject: [PATCH 10/32] removed some options from query config --- src/query/mod.rs | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/query/mod.rs b/src/query/mod.rs index 8b18ba675..2f2e12d72 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -89,28 +89,29 @@ impl Query { let mut config = SessionConfig::default() .with_parquet_pruning(true) - .with_prefer_existing_sort(true) + //.with_prefer_existing_sort(true) .with_round_robin_repartition(true); // For more details refer https://datafusion.apache.org/user-guide/configs.html // Reduce the number of rows read (if possible) - config.options_mut().execution.parquet.enable_page_index = true; + //config.options_mut().execution.parquet.enable_page_index = true; // Pushdown filters allows DF to push the filters as far down in the plan as possible // and thus, reducing the number of rows decoded config.options_mut().execution.parquet.pushdown_filters = true; // Reorder filters allows DF to decide the order of filters minimizing the cost of filter evaluation - config.options_mut().execution.parquet.reorder_filters = true; + // config.options_mut().execution.parquet.reorder_filters = true; // Enable StringViewArray // https://www.influxdata.com/blog/faster-queries-with-stringview-part-one-influxdb/ - config - .options_mut() - .execution - .parquet - .schema_force_view_types = true; + // config + // .options_mut() + // .execution + // .parquet + // .schema_force_view_types = true; + config.options_mut().execution.parquet.binary_as_string = true; let state = SessionStateBuilder::new() .with_default_features() From 02073380268b05b543ec0a08528244a47d96dc08 Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Tue, 4 Feb 2025 13:55:27 -0500 Subject: [PATCH 11/32] Utf8View changes --- src/catalog/column.rs | 16 +++++----- src/event/format/json.rs | 47 ++++++++++++++++++++++++++--- src/event/format/mod.rs | 8 ++--- src/query/stream_schema_provider.rs | 4 +-- src/static_schema.rs | 4 +-- 5 files changed, 58 insertions(+), 21 deletions(-) diff --git a/src/catalog/column.rs b/src/catalog/column.rs index d5db2950d..3b429c6fc 100644 --- a/src/catalog/column.rs +++ b/src/catalog/column.rs @@ -41,7 +41,7 @@ pub struct Int64Type { } #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] -pub struct Utf8Type { +pub struct Utf8ViewType { pub min: String, pub max: String, } @@ -54,7 +54,7 @@ pub enum TypedStatistics { Bool(BoolType), Int(Int64Type), Float(Float64Type), - String(Utf8Type), + String(Utf8ViewType), } impl TypedStatistics { @@ -79,7 +79,7 @@ impl TypedStatistics { }) } (TypedStatistics::String(this), TypedStatistics::String(other)) => { - TypedStatistics::String(Utf8Type { + TypedStatistics::String(Utf8ViewType { min: min(this.min, other.min), max: 
max(this.max, other.max),
 })
 }
@@ -110,9 +110,9 @@ impl TypedStatistics {
 ScalarValue::Float64(Some(stats.min)),
 ScalarValue::Float64(Some(stats.max)),
 ),
- (TypedStatistics::String(stats), DataType::Utf8) => (
- ScalarValue::Utf8(Some(stats.min)),
- ScalarValue::Utf8(Some(stats.max)),
+ (TypedStatistics::String(stats), DataType::Utf8View) => (
+ ScalarValue::Utf8View(Some(stats.min)),
+ ScalarValue::Utf8View(Some(stats.max)),
 ),
 _ => {
 return None;
@@ -167,7 +167,7 @@ impl TryFrom<&Statistics> for TypedStatistics {
 min: *stats.min_opt().expect("Float64 stats min not set"),
 max: *stats.max_opt().expect("Float64 stats max not set"),
 }),
- Statistics::ByteArray(stats) => TypedStatistics::String(Utf8Type {
+ Statistics::ByteArray(stats) => TypedStatistics::String(Utf8ViewType {
 min: stats
 .min_opt()
 .expect("Utf8 stats min not set")
 .as_utf8()?
 .to_owned(),
 max: stats
 .max_opt()
 .expect("Utf8 stats max not set")
 .as_utf8()?
 .to_owned(),
 }),
- Statistics::FixedLenByteArray(stats) => TypedStatistics::String(Utf8Type {
+ Statistics::FixedLenByteArray(stats) => TypedStatistics::String(Utf8ViewType {
 min: stats
 .min_opt()
 .expect("Utf8 stats min not set")
diff --git a/src/event/format/json.rs
index 1ff14085a..657cbdfc0 100644
--- a/src/event/format/json.rs
+++ b/src/event/format/json.rs
@@ -20,7 +20,7 @@
 #![allow(deprecated)]
 use anyhow::anyhow;
-use arrow_array::RecordBatch;
+use arrow_array::{RecordBatch, StringArray, StringViewArray};
 use arrow_json::reader::{infer_json_schema_from_iterator, ReaderBuilder};
 use arrow_schema::{DataType, Field, Fields, Schema};
 use datafusion::arrow::util::bit_util::round_upto_multiple_of_64;
@@ -107,15 +107,52 @@ impl EventFormat for Event {
 // Convert the Data type (defined above) to arrow record batch
 fn decode(data: Self::Data, schema: Arc<Schema>) -> Result {
+ // First create a schema with Utf8 instead of Utf8View
+ let temp_schema = Schema::new(
+ schema
+ .fields()
+ .iter()
+ .map(|field| {
+ if matches!(field.data_type(), DataType::Utf8View) {
+ Arc::new(Field::new(field.name(), DataType::Utf8, field.is_nullable()))
+ } else {
+ field.clone()
+ }
+ })
+ .collect::<Vec<_>>(),
+ );
+
 let array_capacity = round_upto_multiple_of_64(data.len());
- let mut reader = ReaderBuilder::new(schema)
+ let mut reader = ReaderBuilder::new(Arc::new(temp_schema))
 .with_batch_size(array_capacity)
 .with_coerce_primitive(false)
+ .with_strict_mode(false)
 .build_decoder()?;
-
+ reader.serialize(&data)?;
+
 match reader.flush() {
- Ok(Some(recordbatch)) => Ok(recordbatch),
+ Ok(Some(temp_batch)) => {
+ // Convert Utf8 arrays to Utf8View arrays where needed
+ let new_columns: Vec<Arc<dyn Array>> = temp_batch
+ .columns()
+ .iter()
+ .zip(schema.fields())
+ .map(|(col, field)| {
+ if matches!(field.data_type(), DataType::Utf8View) {
+ let string_array = col
+ .as_any()
+ .downcast_ref::<StringArray>()
+ .expect("Expected StringArray");
+ Arc::new(StringViewArray::from(string_array.iter().map(|s| s.map(|s| s.to_string())).collect::<Vec<_>>()))
+ } else {
+ col.clone()
+ }
+ })
+ .collect();
+
+ Ok(RecordBatch::try_new(schema, new_columns)?) 
+ } Err(err) => Err(anyhow!("Failed to create recordbatch due to {:?}", err)), Ok(None) => unreachable!("all records are added to one rb"), } @@ -179,7 +216,7 @@ fn valid_type(data_type: &DataType, value: &Value, schema_version: SchemaVersion DataType::Float16 | DataType::Float32 | DataType::Float64 => value.is_f64(), // All numbers can be cast as Float64 from schema version v1 DataType::Int64 => value.is_i64() || is_parsable_as_number(value), - DataType::Utf8 => value.is_string(), + DataType::Utf8View => value.is_string(), DataType::List(field) => { let data_type = field.data_type(); if let Value::Array(arr) = value { diff --git a/src/event/format/mod.rs b/src/event/format/mod.rs index 32af458d2..e77685315 100644 --- a/src/event/format/mod.rs +++ b/src/event/format/mod.rs @@ -262,7 +262,7 @@ pub fn update_field_type_in_schema( // time_partition field not present in existing schema with string type data as timestamp if field.name() == time_partition && !existing_field_names.contains(field.name()) - && field.data_type() == &DataType::Utf8 + && field.data_type() == &DataType::Utf8View { let new_data_type = DataType::Timestamp(TimeUnit::Millisecond, None); Field::new(field.name(), new_data_type, true) @@ -298,7 +298,7 @@ pub fn override_data_type( if TIME_FIELD_NAME_PARTS .iter() .any(|part| field_name.to_lowercase().contains(part)) - && field.data_type() == &DataType::Utf8 + && field.data_type() == &DataType::Utf8View && (DateTime::parse_from_rfc3339(s).is_ok() || DateTime::parse_from_rfc2822(s).is_ok()) => { @@ -319,7 +319,7 @@ pub fn override_data_type( .unwrap() .data_type() == &DataType::Int64 - && field.data_type() == &DataType::Utf8 + && field.data_type() == &DataType::Utf8View && s.parse::().is_ok()) => { // Update the field's data type to Float64 @@ -329,7 +329,7 @@ pub fn override_data_type( if TIME_FIELD_NAME_PARTS .iter() .any(|part| field_name.to_lowercase().contains(part)) - && field.data_type() == &DataType::Utf8 + && field.data_type() == &DataType::Utf8View && NaiveDate::parse_from_str(s, "%Y-%m-%d").is_ok() => { // Update the field's data type to Timestamp diff --git a/src/query/stream_schema_provider.rs b/src/query/stream_schema_provider.rs index eff01cd49..05a9a666b 100644 --- a/src/query/stream_schema_provider.rs +++ b/src/query/stream_schema_provider.rs @@ -796,7 +796,7 @@ fn extract_timestamp_bound( binexpr.op, DateTime::from_timestamp_nanos(*value).naive_utc(), )), - ScalarValue::Utf8(Some(str_value)) if is_time_partition => { + ScalarValue::Utf8View(Some(str_value)) if is_time_partition => { Some((binexpr.op, str_value.parse::().unwrap())) } _ => None, @@ -919,7 +919,7 @@ fn cast_or_none(scalar: &ScalarValue) -> Option> { ScalarValue::UInt16(val) => val.map(|val| CastRes::Int(val as i64)), ScalarValue::UInt32(val) => val.map(|val| CastRes::Int(val as i64)), ScalarValue::UInt64(val) => val.map(|val| CastRes::Int(val as i64)), - ScalarValue::Utf8(val) => val.as_ref().map(|val| CastRes::String(val)), + ScalarValue::Utf8View(val) => val.as_ref().map(|val| CastRes::String(val)), ScalarValue::TimestampMillisecond(val, _) => val.map(CastRes::Int), _ => None, } diff --git a/src/static_schema.rs b/src/static_schema.rs index 6db0720ba..56f6c5542 100644 --- a/src/static_schema.rs +++ b/src/static_schema.rs @@ -101,11 +101,11 @@ pub fn convert_static_schema_to_arrow_schema( "int" => DataType::Int64, "double" | "float" => DataType::Float64, "boolean" => DataType::Boolean, - "string" => DataType::Utf8, + "string" => DataType::Utf8View, "date" => DataType::Date32, "datetime" => 
DataType::Timestamp(TimeUnit::Millisecond, None), "string_list" => { - DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))) + DataType::List(Arc::new(Field::new("item", DataType::Utf8View, true))) } "int_list" => { DataType::List(Arc::new(Field::new("item", DataType::Int64, true))) From e471746d5d331951a7b5a52b808c5eae4de8bf5f Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Thu, 6 Feb 2025 03:54:24 -0500 Subject: [PATCH 12/32] register parquets in tables --- Cargo.lock | 24 ++++++++++ Cargo.toml | 1 + src/handlers/http/query.rs | 9 +++- src/query/mod.rs | 94 +++++++++++++++++++++++++++++++++++++- 4 files changed, 125 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 908219d4c..e38a37a07 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2213,6 +2213,15 @@ version = "0.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" +[[package]] +name = "heck" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" +dependencies = [ + "unicode-segmentation", +] + [[package]] name = "heck" version = "0.4.1" @@ -3283,6 +3292,7 @@ dependencies = [ "sha1_smol", "sha2", "static-files", + "structopt-derive", "sysinfo", "thiserror 2.0.9", "tokio", @@ -4463,6 +4473,19 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +[[package]] +name = "structopt-derive" +version = "0.4.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcb5ae327f9cc13b68763b5749770cb9e048a99bd9dfdfa58d0cf05d5f64afe0" +dependencies = [ + "heck 0.3.3", + "proc-macro-error", + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "strum" version = "0.26.2" @@ -4495,6 +4518,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" dependencies = [ "proc-macro2", + "quote", "unicode-ident", ] diff --git a/Cargo.toml b/Cargo.toml index 97832b7f3..f92e51d8e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -116,6 +116,7 @@ static-files = "0.2" thiserror = "2.0.0" ulid = { version = "1.0", features = ["serde"] } xxhash-rust = { version = "0.8", features = ["xxh3"] } +structopt-derive = "0.4.18" [build-dependencies] cargo_toml = "0.20.1" diff --git a/src/handlers/http/query.rs b/src/handlers/http/query.rs index 5cfc791ad..f090eb93a 100644 --- a/src/handlers/http/query.rs +++ b/src/handlers/http/query.rs @@ -41,7 +41,7 @@ use crate::event::commit_schema; use crate::metrics::QUERY_EXECUTE_TIME; use crate::option::{Mode, CONFIG}; use crate::query::error::ExecuteError; -use crate::query::{CountsRequest, CountsResponse, Query as LogicalQuery}; +use crate::query::{run, CountsRequest, CountsResponse, Query as LogicalQuery}; use crate::query::{TableScanVisitor, QUERY_SESSION}; use crate::rbac::Users; use crate::response::QueryResponse; @@ -69,6 +69,9 @@ pub struct Query { } pub async fn query(req: HttpRequest, query_request: Query) -> Result { + + let _ = run().await; + println!("benchmarking complete"); let session_state = QUERY_SESSION.state(); let raw_logical_plan = match session_state .create_logical_plan(&query_request.query) @@ -113,6 +116,8 @@ pub async fn query(req: HttpRequest, query_request: Query) -> Result Result) -> SessionContext { let runtime_config = storage @@ -614,6 
+620,92 @@ pub fn flatten_objects_for_count(objects: Vec<Value>) -> Vec<Value> {
 }
 }
+struct AllQueries {
+ queries: Vec<String>,
+}
+
+impl AllQueries {
+ fn try_new(path: &Path) -> Result<Self> {
+ // ClickBench has all queries in a single file identified by line number
+ let all_queries = std::fs::read_to_string(path)
+ .map_err(|e| exec_datafusion_err!("Could not open {path:?}: {e}"))?;
+ Ok(Self {
+ queries: all_queries.lines().map(|s| s.to_string()).collect(),
+ })
+ }
+
+ /// Returns the text of query `query_id`
+ fn get_query(&self, query_id: usize) -> Result<&str> {
+ self.queries
+ .get(query_id)
+ .ok_or_else(|| {
+ let min_id = self.min_query_id();
+ let max_id = self.max_query_id();
+ exec_datafusion_err!(
+ "Invalid query id {query_id}. Must be between {min_id} and {max_id}"
+ )
+ })
+ .map(|s| s.as_str())
+ }
+
+ fn min_query_id(&self) -> usize {
+ 0
+ }
+
+ fn max_query_id(&self) -> usize {
+ self.queries.len() - 1
+ }
+}
+
+pub async fn run() -> Result<()> {
+ println!("Running benchmarks");
+ let queries_path: PathBuf = ["/home", "clickbench", "queries.sql"].iter().collect();
+ let queries = AllQueries::try_new(queries_path.as_path())?;
+ println!("queries loaded");
+ println!("query no. 
1: {:?}", queries.get_query(1)?); @@ -667,9 +667,11 @@ pub async fn run() -> Result<()> { // configure parquet options let mut config = SessionConfig::new() - .with_target_partitions(num_cpus::get()); + .with_parquet_pruning(true) + .with_target_partitions(num_cpus::get()) + .with_round_robin_repartition(true); config.options_mut().execution.parquet.binary_as_string = true; - + config.options_mut().execution.parquet.pushdown_filters = true; let ctx = SessionContext::new_with_config(config); register_hits(&ctx).await?; @@ -693,7 +695,8 @@ async fn register_hits(ctx: &SessionContext) -> Result<()> { .iter() .map(|s| (s.to_string(), DataType::Utf8)) .collect(); - let path: PathBuf = ["/home", "ubuntu", "parseable", "hits"].iter().collect(); + options.parquet_pruning = Some(true); + let path: PathBuf = ["/home", "ubuntu", "parseable", "data", "hits"].iter().collect(); let path = path.as_os_str().to_str().unwrap(); println!("Registering 'hits' as {path}"); ctx.register_parquet("hits", path, options) From 87c1c511df2c13b406c8c1ed2113838de8629620 Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Thu, 6 Feb 2025 13:02:35 -0500 Subject: [PATCH 14/32] config updated --- src/query/mod.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/query/mod.rs b/src/query/mod.rs index e0e9c0e2d..fbdf1d0c4 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -669,9 +669,13 @@ pub async fn run() -> Result<()> { let mut config = SessionConfig::new() .with_parquet_pruning(true) .with_target_partitions(num_cpus::get()) - .with_round_robin_repartition(true); + .with_coalesce_batches(true) + .with_collect_statistics(true) + .with_parquet_page_index_pruning(true); config.options_mut().execution.parquet.binary_as_string = true; config.options_mut().execution.parquet.pushdown_filters = true; + config.options_mut().execution.parquet.reorder_filters = true; + config.options_mut().execution.use_row_number_estimates_to_optimize_partitioning = true; let ctx = SessionContext::new_with_config(config); register_hits(&ctx).await?; @@ -695,7 +699,8 @@ async fn register_hits(ctx: &SessionContext) -> Result<()> { .iter() .map(|s| (s.to_string(), DataType::Utf8)) .collect(); - options.parquet_pruning = Some(true); + let schema = STREAM_INFO.schema("hits").unwrap(); + options.schema = Some(&schema); let path: PathBuf = ["/home", "ubuntu", "parseable", "data", "hits"].iter().collect(); let path = path.as_os_str().to_str().unwrap(); println!("Registering 'hits' as {path}"); From 8c30e300de8fe93167a73e70768a3b2cf9f5063e Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Wed, 12 Feb 2025 07:01:40 -0500 Subject: [PATCH 15/32] add runtime config and logical plan --- Cargo.lock | 2691 ++++++++++++++-------- Cargo.toml | 39 +- src/handlers/airplane.rs | 696 +++--- src/handlers/http/modal/ingest_server.rs | 3 +- src/handlers/http/modal/query_server.rs | 3 +- src/handlers/http/modal/server.rs | 2 +- src/query/catalog.rs | 212 ++ src/query/functions.rs | 461 ++++ src/query/mod.rs | 45 +- src/query/object_storage.rs | 456 ++++ src/query/stream_schema_provider.rs | 9 + src/utils/arrow/flight.rs | 322 +-- 12 files changed, 3401 insertions(+), 1538 deletions(-) create mode 100644 src/query/catalog.rs create mode 100644 src/query/functions.rs create mode 100644 src/query/object_storage.rs diff --git a/Cargo.lock b/Cargo.lock index e38a37a07..e4428d9ae 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8,7 +8,7 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"5f7b0a21988c1bf877cf4759ef5ddaac04c1c9fe808c9142ecb78ba97d97a28a" dependencies = [ - "bitflags 2.5.0", + "bitflags 2.8.0", "bytes", "futures-core", "futures-sink", @@ -46,8 +46,8 @@ dependencies = [ "actix-tls", "actix-utils", "ahash", - "base64 0.22.0", - "bitflags 2.5.0", + "base64 0.22.1", + "bitflags 2.8.0", "brotli 6.0.0", "bytes", "bytestring", @@ -65,7 +65,7 @@ dependencies = [ "mime", "percent-encoding", "pin-project-lite", - "rand", + "rand 0.8.5", "sha1", "smallvec", "tokio", @@ -81,7 +81,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e01ed3140b2f8d422c68afa1ed2e85d996ea619c988ac834d255db32138655cb" dependencies = [ "quote", - "syn 2.0.87", + "syn 2.0.98", ] [[package]] @@ -101,9 +101,9 @@ dependencies = [ [[package]] name = "actix-rt" -version = "2.9.0" +version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28f32d40287d3f402ae0028a9d54bef51af15c8769492826a69d28f81893151d" +checksum = "24eda4e2a6e042aa4e55ac438a2ae052d3b5da0ecf83d7411e1a368946925208" dependencies = [ "futures-core", "tokio", @@ -111,16 +111,16 @@ dependencies = [ [[package]] name = "actix-server" -version = "2.3.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3eb13e7eef0423ea6eab0e59f6c72e7cb46d33691ad56a726b3cd07ddec2c2d4" +checksum = "7ca2549781d8dd6d75c40cf6b6051260a2cc2f3c62343d761a969a0640646894" dependencies = [ "actix-rt", "actix-service", "actix-utils", "futures-core", "futures-util", - "mio 0.8.11", + "mio", "socket2", "tokio", "tracing", @@ -218,18 +218,18 @@ dependencies = [ "actix-router", "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.98", ] [[package]] name = "actix-web-httpauth" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d613edf08a42ccc6864c941d30fe14e1b676a77d16f1dbadc1174d065a0a775" +checksum = "456348ed9dcd72a13a1f4a660449fafdecee9ac8205552e286809eb5b0b29bd3" dependencies = [ "actix-utils", "actix-web", - "base64 0.21.7", + "base64 0.22.1", "futures-core", "futures-util", "log", @@ -248,7 +248,7 @@ dependencies = [ "pin-project", "prometheus", "quanta", - "thiserror 1.0.64", + "thiserror 1.0.69", ] [[package]] @@ -265,19 +265,13 @@ dependencies = [ [[package]] name = "addr2line" -version = "0.21.0" +version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" +checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1" dependencies = [ "gimli", ] -[[package]] -name = "adler" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" - [[package]] name = "adler2" version = "2.0.0" @@ -292,10 +286,10 @@ checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", "const-random", - "getrandom", + "getrandom 0.2.15", "once_cell", "version_check", - "zerocopy", + "zerocopy 0.7.35", ] [[package]] @@ -324,9 +318,9 @@ dependencies = [ [[package]] name = "allocator-api2" -version = "0.2.18" +version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c6cb57a04249c6480766f7f7cef5467412af1490f8d1e243141daddada3264f" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" [[package]] name = "android-tzdata" @@ -345,66 +339,68 @@ dependencies = [ [[package]] name = "anstream" 
-version = "0.6.13" +version = "0.6.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d96bd03f33fe50a863e394ee9718a706f988b9079b20c3784fb726e7678b62fb" +checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", + "is_terminal_polyfill", "utf8parse", ] [[package]] name = "anstyle" -version = "1.0.6" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" [[package]] name = "anstyle-parse" -version = "0.2.3" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c" +checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" -version = "1.0.2" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648" +checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] name = "anstyle-wincon" -version = "3.0.2" +version = "3.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7" +checksum = "ca3534e77181a9cc07539ad51f2141fe32f6c3ffd4df76db8ad92346b003ae4e" dependencies = [ "anstyle", - "windows-sys 0.52.0", + "once_cell", + "windows-sys 0.59.0", ] [[package]] name = "anyhow" -version = "1.0.82" +version = "1.0.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f538837af36e6f6a9be0faa67f9a314f8119e4e4b5867c6ab40ed60360142519" +checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04" dependencies = [ "backtrace", ] [[package]] name = "arbitrary" -version = "1.3.2" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d5a26814d8dcb93b0e5a0ff3c6d80a8843bafb21b39e8e18a6f05471870e110" +checksum = "dde20b3d026af13f561bdd0f15edf01fc734f0dafcedbaf42bba506a9517f223" dependencies = [ "derive_arbitrary", ] @@ -423,21 +419,21 @@ dependencies = [ [[package]] name = "arrayref" -version = "0.3.7" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b4930d2cb77ce62f89ee5d5289b4ac049559b1c45539271f5ed4fdc7db34545" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" [[package]] name = "arrayvec" -version = "0.7.4" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "53.3.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c91839b07e474b3995035fd8ac33ee54f9c9ccbbb1ea33d9909c71bffdf1259d" +checksum = "6422e12ac345a0678d7a17e316238e3a40547ae7f92052b77bd86d5e0239f3fc" dependencies = [ "arrow-arith", "arrow-array", @@ -456,24 +452,23 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "53.3.0" +version = "54.1.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "855c57c4efd26722b044dcd3e348252560e3e0333087fb9f6479dc0bf744054f" +checksum = "23cf34bb1f48c41d3475927bcc7be498665b8e80b379b88f62a840337f8b8248" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "chrono", - "half", "num", ] [[package]] name = "arrow-array" -version = "53.3.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd03279cea46569acf9295f6224fbc370c5df184b4d2ecfe97ccb131d5615a7f" +checksum = "fb4a06d507f54b70a277be22a127c8ffe0cec6cd98c0ad8a48e77779bbda8223" dependencies = [ "ahash", "arrow-buffer", @@ -488,9 +483,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "53.3.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e4a9b9b1d6d7117f6138e13bc4dd5daa7f94e671b70e8c9c4dc37b4f5ecfc16" +checksum = "d69d326d5ad1cb82dcefa9ede3fee8fdca98f9982756b16f9cb142f4aa6edc89" dependencies = [ "bytes", "half", @@ -499,9 +494,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "53.3.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc70e39916e60c5b7af7a8e2719e3ae589326039e1e863675a008bee5ffe90fd" +checksum = "626e65bd42636a84a238bed49d09c8777e3d825bf81f5087a70111c2831d9870" dependencies = [ "arrow-array", "arrow-buffer", @@ -509,7 +504,7 @@ dependencies = [ "arrow-schema", "arrow-select", "atoi", - "base64 0.22.0", + "base64 0.22.1", "chrono", "comfy-table", "half", @@ -520,28 +515,25 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "53.3.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "789b2af43c1049b03a8d088ff6b2257cdcea1756cd76b174b1f2600356771b97" +checksum = "71c8f959f7a1389b1dbd883cdcd37c3ed12475329c111912f7f69dad8195d8c6" dependencies = [ "arrow-array", - "arrow-buffer", "arrow-cast", - "arrow-data", "arrow-schema", "chrono", "csv", "csv-core", "lazy_static", - "lexical-core", "regex", ] [[package]] name = "arrow-data" -version = "53.3.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4e75edf21ffd53744a9b8e3ed11101f610e7ceb1a29860432824f1834a1f623" +checksum = "1858e7c7d01c44cf71c21a85534fd1a54501e8d60d1195d0d6fbcc00f4b10754" dependencies = [ "arrow-buffer", "arrow-schema", @@ -551,34 +543,39 @@ dependencies = [ [[package]] name = "arrow-flight" -version = "53.3.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c09b331887a526f203f2123444792aee924632bd08b9940435070901075832e" +checksum = "a9b3aaba47ed4b6146563c8b79ad0f7aa283f794cde0c057c656291b81196746" dependencies = [ + "arrow-arith", "arrow-array", "arrow-buffer", "arrow-cast", + "arrow-data", "arrow-ipc", + "arrow-ord", + "arrow-row", "arrow-schema", - "base64 0.22.0", + "arrow-select", + "arrow-string", + "base64 0.22.1", "bytes", "futures", + "once_cell", "paste", "prost", "prost-types", - "tokio", "tonic", ] [[package]] name = "arrow-ipc" -version = "53.3.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d186a909dece9160bf8312f5124d797884f608ef5435a36d9d608e0b2a9bcbf8" +checksum = "a6bb3f727f049884c7603f0364bc9315363f356b59e9f605ea76541847e06a1e" dependencies = [ "arrow-array", "arrow-buffer", - "arrow-cast", "arrow-data", "arrow-schema", "flatbuffers", @@ -588,9 +585,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "53.3.0" 
+version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b66ff2fedc1222942d0bd2fd391cb14a85baa3857be95c9373179bd616753b85" +checksum = "35de94f165ed8830aede72c35f238763794f0d49c69d30c44d49c9834267ff8c" dependencies = [ "arrow-array", "arrow-buffer", @@ -599,7 +596,7 @@ dependencies = [ "arrow-schema", "chrono", "half", - "indexmap 2.5.0", + "indexmap 2.7.1", "lexical-core", "num", "serde", @@ -608,26 +605,23 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "53.3.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ece7b5bc1180e6d82d1a60e1688c199829e8842e38497563c3ab6ea813e527fd" +checksum = "8aa06e5f267dc53efbacb933485c79b6fc1685d3ffbe870a16ce4e696fb429da" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "arrow-select", - "half", - "num", ] [[package]] name = "arrow-row" -version = "53.3.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "745c114c8f0e8ce211c83389270de6fbe96a9088a7b32c2a041258a443fe83ff" +checksum = "66f1144bb456a2f9d82677bd3abcea019217e572fc8f07de5a7bac4b2c56eb2c" dependencies = [ - "ahash", "arrow-array", "arrow-buffer", "arrow-data", @@ -637,18 +631,18 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "53.3.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b95513080e728e4cec37f1ff5af4f12c9688d47795d17cda80b6ec2cf74d4678" +checksum = "105f01ec0090259e9a33a9263ec18ff223ab91a0ea9fbc18042f7e38005142f6" dependencies = [ "serde", ] [[package]] name = "arrow-select" -version = "53.3.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e415279094ea70323c032c6e739c48ad8d80e78a09bef7117b8718ad5bf3722" +checksum = "f690752fdbd2dee278b5f1636fefad8f2f7134c85e20fd59c4199e15a39a6807" dependencies = [ "ahash", "arrow-array", @@ -660,9 +654,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "53.3.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11d956cae7002eb8d83a27dbd34daaea1cf5b75852f0b84deb4d93a276e92bbf" +checksum = "d0fff9cd745a7039b66c47ecaf5954460f9fa12eed628f65170117ea93e64ee0" dependencies = [ "arrow-array", "arrow-buffer", @@ -677,11 +671,11 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.8" +version = "0.4.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07dbbf24db18d609b1462965249abdf49129ccad073ec257da372adc83259c60" +checksum = "df895a515f70646414f4b45c0b79082783b80552b373a68283012928df56f522" dependencies = [ - "brotli 4.0.0", + "brotli 7.0.0", "bzip2 0.4.4", "flate2", "futures-core", @@ -695,9 +689,9 @@ dependencies = [ [[package]] name = "async-stream" -version = "0.3.5" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd56dd203fef61ac097dd65721a419ddccb106b2d2b70ba60a6b529f03961a51" +checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476" dependencies = [ "async-stream-impl", "futures-core", @@ -706,24 +700,24 @@ dependencies = [ [[package]] name = "async-stream-impl" -version = "0.3.5" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" +checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + 
"syn 2.0.98", ] [[package]] name = "async-trait" -version = "0.1.82" +version = "0.1.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a27b8a3a6e1a44fa4c8baf1f653e4172e81486d4941f2237e20dc2d0cf4ddff1" +checksum = "644dd749086bf3771a2fbc5f256fdb982d53f011c7d5d560304eafeecebce79d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.98", ] [[package]] @@ -743,22 +737,319 @@ checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" [[package]] name = "autocfg" -version = "1.2.0" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1fdabc7756949593fe60f30ec81974b613357de856987752631dea1e3394c80" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "aws-config" +version = "1.5.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50236e4d60fe8458de90a71c0922c761e41755adf091b1b03de1cef537179915" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-sdk-sso", + "aws-sdk-ssooidc", + "aws-sdk-sts", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "fastrand 2.3.0", + "hex", + "http 0.2.12", + "ring", + "time", + "tokio", + "tracing", + "url", + "zeroize", +] + +[[package]] +name = "aws-credential-types" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60e8f6b615cb5fc60a98132268508ad104310f0cfb25a1c22eee76efdf9154da" +dependencies = [ + "aws-smithy-async", + "aws-smithy-runtime-api", + "aws-smithy-types", + "zeroize", +] + +[[package]] +name = "aws-runtime" +version = "1.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76dd04d39cc12844c0994f2c9c5a6f5184c22e9188ec1ff723de41910a21dcad" +dependencies = [ + "aws-credential-types", + "aws-sigv4", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "fastrand 2.3.0", + "http 0.2.12", + "http-body 0.4.6", + "once_cell", + "percent-encoding", + "pin-project-lite", + "tracing", + "uuid", +] + +[[package]] +name = "aws-sdk-sso" +version = "1.58.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16ff718c9ee45cc1ebd4774a0e086bb80a6ab752b4902edf1c9f56b86ee1f770" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "http 0.2.12", + "once_cell", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sdk-ssooidc" +version = "1.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5183e088715cc135d8d396fdd3bc02f018f0da4c511f53cb8d795b6a31c55809" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "http 0.2.12", + "once_cell", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sdk-sts" +version = "1.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9f944ef032717596639cea4a2118a3a457268ef51bbb5fde9637e54c465da00" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + 
"aws-smithy-json", + "aws-smithy-query", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-smithy-xml", + "aws-types", + "http 0.2.12", + "once_cell", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sigv4" +version = "1.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bc5bbd1e4a2648fd8c5982af03935972c24a2f9846b396de661d351ee3ce837" +dependencies = [ + "aws-credential-types", + "aws-smithy-http", + "aws-smithy-runtime-api", + "aws-smithy-types", + "bytes", + "form_urlencoded", + "hex", + "hmac", + "http 0.2.12", + "http 1.2.0", + "once_cell", + "percent-encoding", + "sha2", + "time", + "tracing", +] + +[[package]] +name = "aws-smithy-async" +version = "1.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa59d1327d8b5053c54bf2eaae63bf629ba9e904434d0835a28ed3c0ed0a614e" +dependencies = [ + "futures-util", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "aws-smithy-http" +version = "0.60.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7809c27ad8da6a6a68c454e651d4962479e81472aa19ae99e59f9aba1f9713cc" +dependencies = [ + "aws-smithy-runtime-api", + "aws-smithy-types", + "bytes", + "bytes-utils", + "futures-core", + "http 0.2.12", + "http-body 0.4.6", + "once_cell", + "percent-encoding", + "pin-project-lite", + "pin-utils", + "tracing", +] + +[[package]] +name = "aws-smithy-json" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "623a51127f24c30776c8b374295f2df78d92517386f77ba30773f15a30ce1422" +dependencies = [ + "aws-smithy-types", +] + +[[package]] +name = "aws-smithy-query" +version = "0.60.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2fbd61ceb3fe8a1cb7352e42689cec5335833cd9f94103a61e98f9bb61c64bb" +dependencies = [ + "aws-smithy-types", + "urlencoding", +] + +[[package]] +name = "aws-smithy-runtime" +version = "1.7.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d526a12d9ed61fadefda24abe2e682892ba288c2018bcb38b1b4c111d13f6d92" +dependencies = [ + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-runtime-api", + "aws-smithy-types", + "bytes", + "fastrand 2.3.0", + "h2 0.3.26", + "http 0.2.12", + "http-body 0.4.6", + "http-body 1.0.1", + "httparse", + "hyper 0.14.32", + "hyper-rustls 0.24.2", + "once_cell", + "pin-project-lite", + "pin-utils", + "rustls 0.21.12", + "tokio", + "tracing", +] + +[[package]] +name = "aws-smithy-runtime-api" +version = "1.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92165296a47a812b267b4f41032ff8069ab7ff783696d217f0994a0d7ab585cd" +dependencies = [ + "aws-smithy-async", + "aws-smithy-types", + "bytes", + "http 0.2.12", + "http 1.2.0", + "pin-project-lite", + "tokio", + "tracing", + "zeroize", +] + +[[package]] +name = "aws-smithy-types" +version = "1.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7b8a53819e42f10d0821f56da995e1470b199686a1809168db6ca485665f042" +dependencies = [ + "base64-simd", + "bytes", + "bytes-utils", + "http 0.2.12", + "http 1.2.0", + "http-body 0.4.6", + "http-body 1.0.1", + "http-body-util", + "itoa", + "num-integer", + "pin-project-lite", + "pin-utils", + "ryu", + "serde", + "time", +] + +[[package]] +name = "aws-smithy-xml" +version = "0.60.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ab0b0166827aa700d3dc519f72f8b3a91c35d0b8d042dc5d643a91e6f80648fc" +dependencies = [ + "xmlparser", +] + +[[package]] +name = "aws-types" +version = "1.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfbd0a668309ec1f66c0f6bda4840dd6d4796ae26d699ebc266d7cc95c6d040f" +dependencies = [ + "aws-credential-types", + "aws-smithy-async", + "aws-smithy-runtime-api", + "aws-smithy-types", + "rustc_version", + "tracing", +] [[package]] name = "axum" -version = "0.7.5" +version = "0.7.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a6c9af12842a67734c9a2e355436e5d03b22383ed60cf13cd0c18fbfe3dcbcf" +checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f" dependencies = [ "async-trait", "axum-core", "bytes", "futures-util", - "http 1.1.0", - "http-body 1.0.0", + "http 1.2.0", + "http-body 1.0.1", "http-body-util", "itoa", "matchit", @@ -768,64 +1059,68 @@ dependencies = [ "pin-project-lite", "rustversion", "serde", - "sync_wrapper 1.0.1", - "tower", + "sync_wrapper 1.0.2", + "tower 0.5.2", "tower-layer", "tower-service", ] [[package]] name = "axum-core" -version = "0.4.4" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e6b8ba012a258d63c9adfa28b9ddcf66149da6f986c5b5452e629d5ee64bf00" +checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199" dependencies = [ "async-trait", "bytes", "futures-util", - "http 1.1.0", - "http-body 1.0.0", + "http 1.2.0", + "http-body 1.0.1", "http-body-util", "mime", "pin-project-lite", "rustversion", - "sync_wrapper 1.0.1", + "sync_wrapper 1.0.2", "tower-layer", "tower-service", ] [[package]] name = "backtrace" -version = "0.3.71" +version = "0.3.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26b05800d2e817c8b3b4b54abd461726265fa9789ae34330622f2db9ee696f9d" +checksum = "8d82cb332cdfaed17ae235a638438ac4d4839913cc2af585c3c6746e8f8bee1a" dependencies = [ "addr2line", - "cc", "cfg-if", "libc", - "miniz_oxide 0.7.2", + "miniz_oxide", "object", "rustc-demangle", + "windows-targets 0.52.6", ] [[package]] name = "base64" -version = "0.13.1" +version = "0.21.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" [[package]] name = "base64" -version = "0.21.7" +version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" [[package]] -name = "base64" -version = "0.22.0" +name = "base64-simd" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9475866fec1451be56a3c2400fd081ff546538961565ccb5b7142cbd22bc7a51" +checksum = "339abbe78e73178762e23bea9dfd08e697eb3f3301cd4be981c0f78ba5859195" +dependencies = [ + "outref", + "vsimd", +] [[package]] name = "base64ct" @@ -870,9 +1165,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.5.0" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" +checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36" [[package]] name = "blake2" @@ -885,9 +1180,9 @@ dependencies = [ 
[[package]] name = "blake3" -version = "1.5.1" +version = "1.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30cca6d3674597c30ddf2c587bf8d9d65c9a84d2326d941cc79c9842dfe0ef52" +checksum = "b8ee0c1824c4dea5b5f81736aff91bae041d2c07ee1192bec91054e10e3e601e" dependencies = [ "arrayref", "arrayvec", @@ -905,17 +1200,6 @@ dependencies = [ "generic-array", ] -[[package]] -name = "brotli" -version = "4.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "125740193d7fee5cc63ab9e16c2fdc4e07c74ba755cc53b327d6ea029e9fc569" -dependencies = [ - "alloc-no-stdlib", - "alloc-stdlib", - "brotli-decompressor 3.0.0", -] - [[package]] name = "brotli" version = "6.0.0" @@ -924,7 +1208,7 @@ checksum = "74f7971dbd9326d58187408ab83117d8ac1bb9c17b085fdacd1cf2f598719b6b" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", - "brotli-decompressor 4.0.1", + "brotli-decompressor", ] [[package]] @@ -935,24 +1219,14 @@ checksum = "cc97b8f16f944bba54f0433f07e30be199b6dc2bd25937444bbad560bcea29bd" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", - "brotli-decompressor 4.0.1", + "brotli-decompressor", ] [[package]] name = "brotli-decompressor" -version = "3.0.0" +version = "4.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65622a320492e09b5e0ac436b14c54ff68199bac392d0e89a6832c4518eea525" -dependencies = [ - "alloc-no-stdlib", - "alloc-stdlib", -] - -[[package]] -name = "brotli-decompressor" -version = "4.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a45bd2e4095a8b518033b128020dd4a55aab1c0a381ba4404a472630f4bc362" +checksum = "74fa05ad7d803d413eb8380983b092cbbaf9a85f151b871360e7b00cd7060b37" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -960,9 +1234,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.16.0" +version = "3.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" +checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" [[package]] name = "byteorder" @@ -972,15 +1246,25 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.6.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" +checksum = "f61dac84819c6588b558454b194026eb1f09c293b9036ae9b159e74e73ab6cf9" + +[[package]] +name = "bytes-utils" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dafe3a8757b027e2be6e4e5601ed563c55989fcf1546e933c66c8eb3a058d35" +dependencies = [ + "bytes", + "either", +] [[package]] name = "bytestring" -version = "1.3.1" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74d80203ea6b29df88012294f62733de21cfeab47f17b41af3a38bc30a03ee72" +checksum = "e465647ae23b2823b0753f50decb2d5a86d2bb2cac04788fafd1f80e45378e5f" dependencies = [ "bytes", ] @@ -997,19 +1281,18 @@ dependencies = [ [[package]] name = "bzip2" -version = "0.5.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bafdbf26611df8c14810e268ddceda071c297570a5fb360ceddf617fe417ef58" +checksum = "75b89e7c29231c673a61a46e722602bcd138298f6b9e81e71119693534585f5c" dependencies = [ "bzip2-sys", - "libc", ] [[package]] name = "bzip2-sys" -version = "0.1.11+1.0.8" +version = "0.1.12+1.0.8" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" +checksum = "72ebc2f1a417f01e1da30ef264ee86ae31d2dcd2d603ea283d3c244a883ca2a9" dependencies = [ "cc", "libc", @@ -1018,18 +1301,18 @@ dependencies = [ [[package]] name = "camino" -version = "1.1.6" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c59e92b5a388f549b863a7bea62612c09f24c8393560709a54558a9abdfb3b9c" +checksum = "8b96ec4966b5813e2c0507c1f86115c8c5abaadc3980879c3424042a02fd1ad3" dependencies = [ "serde", ] [[package]] name = "cargo-platform" -version = "0.1.8" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24b1f0365a6c6bb4020cd05806fd0d33c44d38046b8bd7f0e40814b9763cabfc" +checksum = "e35af189006b9c0f00a064685c727031e3ed2d8020f7ba284d78cc2671bd36ea" dependencies = [ "serde", ] @@ -1045,14 +1328,14 @@ dependencies = [ "semver", "serde", "serde_json", - "thiserror 1.0.64", + "thiserror 1.0.69", ] [[package]] name = "cargo_toml" -version = "0.20.1" +version = "0.20.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35a60491a82bdc0440640298990087ac1625e23c2feacd584eb33775903d5bb3" +checksum = "88da5a13c620b4ca0078845707ea9c3faf11edbc3ffd8497d11d686211cd1ac0" dependencies = [ "serde", "toml", @@ -1060,9 +1343,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.1.23" +version = "1.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bbb537bb4a30b90362caddba8f360c0a56bc13d3a5570028e7197204cb54a17" +checksum = "c7777341816418c02e033934a09f20dc0ccaf65a5201ef8a450ae0105a573fda" dependencies = [ "jobserver", "libc", @@ -1075,6 +1358,12 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "cfg_aliases" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" + [[package]] name = "change-detection" version = "1.2.0" @@ -1087,9 +1376,9 @@ dependencies = [ [[package]] name = "chrono" -version = "0.4.38" +version = "0.4.39" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401" +checksum = "7e36cc9d416881d2e24f9a963be5fb1cd90966419ac844274161d10488b3e825" dependencies = [ "android-tzdata", "iana-time-zone", @@ -1111,9 +1400,9 @@ dependencies = [ [[package]] name = "chrono-tz" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd6dd8046d00723a59a2f8c5f295c515b9bb9a331ee4f8f3d4dd49e428acd3b6" +checksum = "9c6ac4f2c0bf0f44e9161aec9675e1050aa4a530663c4a9e37e108fa948bca9f" dependencies = [ "chrono", "chrono-tz-build", @@ -1132,9 +1421,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.4" +version = "4.5.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90bc066a67923782aa8515dbaea16946c5bcc5addbd668bb80af688e53e548a0" +checksum = "8acebd8ad879283633b343856142139f2da2317c96b05b4dd6181c61e2480184" dependencies = [ "clap_builder", "clap_derive", @@ -1142,9 +1431,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.2" +version = "4.5.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"ae129e2e766ae0ec03484e609954119f123cc1fe650337e155d03b022f24f7b4" +checksum = "f6ba32cbda51c7e1dfd49acc1457ba1a7dec5b64fe360e828acb13ca8dc9c2f9" dependencies = [ "anstream", "anstyle", @@ -1153,21 +1442,21 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.4" +version = "4.5.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "528131438037fd55894f62d6e9f068b8f45ac57ffa77517819645d10aed04f64" +checksum = "bf4ced95c6f4a675af3da73304b9ac4ed991640c36374e4b46795c49e17cf1ed" dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.98", ] [[package]] name = "clap_lex" -version = "0.7.0" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" +checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" [[package]] name = "clokwerk" @@ -1180,18 +1469,17 @@ dependencies = [ [[package]] name = "colorchoice" -version = "1.0.0" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" +checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" [[package]] name = "comfy-table" -version = "7.1.1" +version = "7.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b34115915337defe99b2aff5c2ce6771e5fbc4079f4b506301f5cf394c8452f7" +checksum = "4a65ebfec4fb190b6f90e944a817d60499ee0744e582530e2c9900a22e591d9a" dependencies = [ - "strum", - "strum_macros", + "unicode-segmentation", "unicode-width", ] @@ -1210,16 +1498,16 @@ version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" dependencies = [ - "getrandom", + "getrandom 0.2.15", "once_cell", "tiny-keccak", ] [[package]] name = "constant_time_eq" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7144d30dcf0fafbce74250a3963025d8d52177934239851c917d29f1df280c2" +checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" [[package]] name = "convert_case" @@ -1258,17 +1546,27 @@ dependencies = [ "libc", ] +[[package]] +name = "core-foundation" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b55271e5c8c478ad3f38ad24ef34923091e0548492a266d19b3c0b4d82574c63" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "core-foundation-sys" -version = "0.8.6" +version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" [[package]] name = "cpufeatures" -version = "0.2.12" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" dependencies = [ "libc", ] @@ -1284,9 +1582,9 @@ dependencies = [ [[package]] name = "crossbeam-deque" -version = "0.8.5" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" dependencies = [ 
"crossbeam-epoch", "crossbeam-utils", @@ -1303,9 +1601,9 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.20" +version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "crossterm" @@ -1313,11 +1611,11 @@ version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "829d955a0bb380ef178a640b91779e3987da38c9aea133b20614cfed8cdea9c6" dependencies = [ - "bitflags 2.5.0", + "bitflags 2.8.0", "crossterm_winapi", - "mio 1.0.2", + "mio", "parking_lot", - "rustix 0.38.34", + "rustix", "signal-hook", "signal-hook-mio", "winapi", @@ -1334,9 +1632,9 @@ dependencies = [ [[package]] name = "crunchy" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" +checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929" [[package]] name = "crypto-common" @@ -1350,9 +1648,9 @@ dependencies = [ [[package]] name = "csv" -version = "1.3.0" +version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" +checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf" dependencies = [ "csv-core", "itoa", @@ -1371,9 +1669,9 @@ dependencies = [ [[package]] name = "darling" -version = "0.20.8" +version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54e36fcd13ed84ffdfda6f5be89b31287cbb80c439841fe69e04841435464391" +checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989" dependencies = [ "darling_core", "darling_macro", @@ -1381,27 +1679,27 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.20.8" +version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c2cf1c23a687a1feeb728783b993c4e1ad83d99f351801977dd809b48d0a70f" +checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5" dependencies = [ "fnv", "ident_case", "proc-macro2", "quote", "strsim", - "syn 2.0.87", + "syn 2.0.98", ] [[package]] name = "darling_macro" -version = "0.20.8" +version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a668eda54683121533a393014d8692171709ff57a7d61f187b6e782719f8933f" +checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ "darling_core", "quote", - "syn 2.0.87", + "syn 2.0.98", ] [[package]] @@ -1420,27 +1718,24 @@ dependencies = [ [[package]] name = "data-encoding" -version = "2.5.0" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e962a19be5cfc3f3bf6dd8f61eb50107f356ad6270fbb3ed41476571db78be5" +checksum = "575f75dfd25738df5b91b8e43e14d44bda14637a58fae779fd2b064f8bf3e010" [[package]] name = "datafusion" -version = "44.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "014fc8c384ecacedaabb3bc8359c2a6c6e9d8f7bea65be3434eccacfc37f52d9" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" dependencies = [ "arrow", - "arrow-array", "arrow-ipc", "arrow-schema", - "async-compression", "async-trait", "bytes", - "bzip2 0.5.0", + "bzip2 0.5.1", "chrono", - 
"dashmap", "datafusion-catalog", + "datafusion-catalog-listing", "datafusion-common", "datafusion-common-runtime", "datafusion-execution", @@ -1450,6 +1745,7 @@ dependencies = [ "datafusion-functions-nested", "datafusion-functions-table", "datafusion-functions-window", + "datafusion-macros", "datafusion-optimizer", "datafusion-physical-expr", "datafusion-physical-expr-common", @@ -1458,18 +1754,16 @@ dependencies = [ "datafusion-sql", "flate2", "futures", - "glob", - "itertools 0.13.0", + "itertools 0.14.0", "log", "object_store", "parking_lot", "parquet", - "rand", + "rand 0.8.5", "regex", "sqlparser", "tempfile", "tokio", - "tokio-util", "url", "uuid", "xz2", @@ -1478,33 +1772,68 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "44.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee60d33e210ef96070377ae667ece7caa0e959c8387496773d4a1a72f1a5012e" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" dependencies = [ - "arrow-schema", + "arrow", "async-trait", + "dashmap", "datafusion-common", "datafusion-execution", "datafusion-expr", "datafusion-physical-plan", + "datafusion-sql", + "futures", + "itertools 0.14.0", + "log", "parking_lot", ] +[[package]] +name = "datafusion-catalog-listing" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" +dependencies = [ + "arrow", + "async-compression", + "async-trait", + "bytes", + "bzip2 0.5.1", + "chrono", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "flate2", + "futures", + "glob", + "itertools 0.14.0", + "log", + "object_store", + "rand 0.8.5", + "tokio", + "tokio-util", + "url", + "xz2", + "zstd", +] + [[package]] name = "datafusion-common" -version = "44.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b42b7d720fe21ed9cca2ebb635f3f13a12cfab786b41e0fba184fb2e620525b" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" dependencies = [ "ahash", "arrow", - "arrow-array", - "arrow-buffer", - "arrow-schema", + "arrow-ipc", + "base64 0.22.1", "half", "hashbrown 0.14.5", - "indexmap 2.5.0", + "indexmap 2.7.1", "libc", "log", "object_store", @@ -1518,9 +1847,8 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "44.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72fbf14d4079f7ce5306393084fe5057dddfdc2113577e0049310afa12e94281" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" dependencies = [ "log", "tokio", @@ -1528,15 +1856,13 @@ dependencies = [ [[package]] name = "datafusion-doc" -version = "44.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c278dbd64860ed0bb5240fc1f4cb6aeea437153910aea69bcf7d5a8d6d0454f3" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" [[package]] name = "datafusion-execution" -version = "44.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e22cb02af47e756468b3cbfee7a83e3d4f2278d452deb4b033ba933c75169486" +version = "45.0.0" +source = 
"git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" dependencies = [ "arrow", "dashmap", @@ -1546,16 +1872,15 @@ dependencies = [ "log", "object_store", "parking_lot", - "rand", + "rand 0.8.5", "tempfile", "url", ] [[package]] name = "datafusion-expr" -version = "44.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62298eadb1d15b525df1315e61a71519ffc563d41d5c3b2a30fda2d70f77b93c" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" dependencies = [ "arrow", "chrono", @@ -1565,7 +1890,7 @@ dependencies = [ "datafusion-functions-aggregate-common", "datafusion-functions-window-common", "datafusion-physical-expr-common", - "indexmap 2.5.0", + "indexmap 2.7.1", "paste", "recursive", "serde_json", @@ -1574,24 +1899,23 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "44.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dda7f73c5fc349251cd3dcb05773c5bf55d2505a698ef9d38dfc712161ea2f55" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" dependencies = [ "arrow", "datafusion-common", - "itertools 0.13.0", + "itertools 0.14.0", + "paste", ] [[package]] name = "datafusion-functions" -version = "44.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd197f3b2975424d3a4898ea46651be855a46721a56727515dbd5c9e2fb597da" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" dependencies = [ "arrow", "arrow-buffer", - "base64 0.22.0", + "base64 0.22.1", "blake2", "blake3", "chrono", @@ -1601,12 +1925,11 @@ dependencies = [ "datafusion-expr", "datafusion-expr-common", "datafusion-macros", - "hashbrown 0.14.5", "hex", - "itertools 0.13.0", + "itertools 0.14.0", "log", "md-5", - "rand", + "rand 0.8.5", "regex", "sha2", "unicode-segmentation", @@ -1615,13 +1938,11 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "44.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aabbe48fba18f9981b134124381bee9e46f93518b8ad2f9721ee296cef5affb9" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" dependencies = [ "ahash", "arrow", - "arrow-schema", "datafusion-common", "datafusion-doc", "datafusion-execution", @@ -1637,9 +1958,8 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" -version = "44.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7a3fefed9c8c11268d446d924baca8cabf52fe32f73fdaa20854bac6473590c" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" dependencies = [ "ahash", "arrow", @@ -1650,31 +1970,28 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "44.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6360f27464fab857bec698af39b2ae331dc07c8bf008fb4de387a19cdc6815a5" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" dependencies = [ "arrow", - "arrow-array", - "arrow-buffer", "arrow-ord", - "arrow-schema", "datafusion-common", + "datafusion-doc", "datafusion-execution", "datafusion-expr", "datafusion-functions", 
"datafusion-functions-aggregate", + "datafusion-macros", "datafusion-physical-expr-common", - "itertools 0.13.0", + "itertools 0.14.0", "log", "paste", ] [[package]] name = "datafusion-functions-table" -version = "44.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c35c070eb705c12795dab399c3809f4dfbc290678c624d3989490ca9b8449c1" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" dependencies = [ "arrow", "async-trait", @@ -1688,9 +2005,8 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "44.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52229bca26b590b140900752226c829f15fc1a99840e1ca3ce1a9534690b82a8" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" dependencies = [ "datafusion-common", "datafusion-doc", @@ -1705,9 +2021,8 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "44.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "367befc303b64a668a10ae6988a064a9289e1999e71a7f8e526b6e14d6bdd9d6" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -1715,27 +2030,26 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "44.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f5de3c8f386ea991696553afe241a326ecbc3c98a12c562867e4be754d3a060c" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" dependencies = [ + "datafusion-expr", "quote", - "syn 2.0.87", + "syn 2.0.98", ] [[package]] name = "datafusion-optimizer" -version = "44.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53b520413906f755910422b016fb73884ae6e9e1b376de4f9584b6c0e031da75" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" dependencies = [ "arrow", "chrono", "datafusion-common", "datafusion-expr", "datafusion-physical-expr", - "indexmap 2.5.0", - "itertools 0.13.0", + "indexmap 2.7.1", + "itertools 0.14.0", "log", "recursive", "regex", @@ -1744,15 +2058,11 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "44.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acd6ddc378f6ad19af95ccd6790dec8f8e1264bc4c70e99ddc1830c1a1c78ccd" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" dependencies = [ "ahash", "arrow", - "arrow-array", - "arrow-buffer", - "arrow-schema", "datafusion-common", "datafusion-expr", "datafusion-expr-common", @@ -1760,8 +2070,8 @@ dependencies = [ "datafusion-physical-expr-common", "half", "hashbrown 0.14.5", - "indexmap 2.5.0", - "itertools 0.13.0", + "indexmap 2.7.1", + "itertools 0.14.0", "log", "paste", "petgraph", @@ -1769,45 +2079,42 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "44.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06e6c05458eccd74b4c77ed6a1fe63d52434240711de7f6960034794dad1caf5" +version = "45.0.0" +source = 
"git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" dependencies = [ "ahash", "arrow", "datafusion-common", "datafusion-expr-common", "hashbrown 0.14.5", - "itertools 0.13.0", + "itertools 0.14.0", ] [[package]] name = "datafusion-physical-optimizer" -version = "44.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9dc3a82190f49c37d377f31317e07ab5d7588b837adadba8ac367baad5dc2351" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" dependencies = [ "arrow", "datafusion-common", "datafusion-execution", + "datafusion-expr", "datafusion-expr-common", "datafusion-physical-expr", + "datafusion-physical-expr-common", "datafusion-physical-plan", - "itertools 0.13.0", + "itertools 0.14.0", "log", "recursive", ] [[package]] name = "datafusion-physical-plan" -version = "44.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a6608bc9844b4ddb5ed4e687d173e6c88700b1d0482f43894617d18a1fe75da" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" dependencies = [ "ahash", "arrow", - "arrow-array", - "arrow-buffer", "arrow-ord", "arrow-schema", "async-trait", @@ -1822,8 +2129,8 @@ dependencies = [ "futures", "half", "hashbrown 0.14.5", - "indexmap 2.5.0", - "itertools 0.13.0", + "indexmap 2.7.1", + "itertools 0.14.0", "log", "parking_lot", "pin-project-lite", @@ -1832,17 +2139,14 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "44.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a884061c79b33d0c8e84a6f4f4be8bdc12c0f53f5af28ddf5d6d95ac0b15fdc" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" dependencies = [ "arrow", - "arrow-array", - "arrow-schema", "bigdecimal", "datafusion-common", "datafusion-expr", - "indexmap 2.5.0", + "indexmap 2.7.1", "log", "recursive", "regex", @@ -1860,26 +2164,26 @@ dependencies = [ [[package]] name = "derive_arbitrary" -version = "1.3.2" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67e77553c4162a157adbf834ebae5b415acbecbeafc7a74b0e886657506a7611" +checksum = "30542c1ad912e0e3d22a1935c290e12e8a29d704a420177a31faad4a601a0800" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.98", ] [[package]] name = "derive_more" -version = "0.99.18" +version = "0.99.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f33878137e4dafd7fa914ad4e259e18a4e8e532b9617a2d0150262bf53abfce" +checksum = "3da29a38df43d6f156149c9b43ded5e018ddff2a855cf2cfd62e8cd7d079c69f" dependencies = [ "convert_case", "proc-macro2", "quote", "rustc_version", - "syn 2.0.87", + "syn 2.0.98", ] [[package]] @@ -1893,6 +2197,27 @@ dependencies = [ "subtle", ] +[[package]] +name = "dirs" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3e8aa94d75141228480295a7d0e7feb620b1a5ad9f12bc40be62411e38cce4e" +dependencies = [ + "dirs-sys", +] + +[[package]] +name = "dirs-sys" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e01a3366d27ee9890022452ee61b2b63a67e6f13f58900b651ff5665f0bb1fab" +dependencies = [ + "libc", + "option-ext", + "redox_users", + "windows-sys 0.59.0", +] + [[package]] name = "displaydoc" version = "0.2.5" @@ -1901,20 +2226,20 @@ checksum = 
"97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.98", ] [[package]] name = "either" -version = "1.11.0" +version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a47c1c47d2f5964e29c61246e81db715514cd532db6b5116a25ea3c03d6780a2" +checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" [[package]] name = "encoding_rs" -version = "0.8.34" +version = "0.8.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b45de904aa0b010bce2ab45264d0631681847fa7b6f2eaa7dab7619943bc4f59" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" dependencies = [ "cfg-if", ] @@ -1927,12 +2252,12 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.8" +version = "0.3.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" +checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d" dependencies = [ "libc", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -1946,21 +2271,21 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.0.2" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "658bd65b1cf4c852a3cc96f18a8ce7b5640f6b703f905c7d74532294c2a63984" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "fixedbitset" -version = "0.4.2" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" +checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" [[package]] name = "flatbuffers" -version = "24.3.25" +version = "24.12.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8add37afff2d4ffa83bc748a70b4b1370984f6980768554182424ef71447c35f" +checksum = "4f1baf0dbf96932ec9a3038d57900329c015b0bfb7b63d904f3bc27e2b02a096" dependencies = [ "bitflags 1.3.2", "rustc_version", @@ -1968,12 +2293,12 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.34" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1b589b4dc103969ad3cf85c950899926ec64300a1a46d76c03a6072957036f0" +checksum = "c936bfdafb507ebbf50b8074c54fa31c5be9a1e7e5f467dd659697041407d07c" dependencies = [ "crc32fast", - "miniz_oxide 0.8.0", + "miniz_oxide", ] [[package]] @@ -1999,9 +2324,9 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "futures" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" +checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" dependencies = [ "futures-channel", "futures-core", @@ -2014,9 +2339,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" dependencies = [ "futures-core", "futures-sink", @@ -2024,15 +2349,15 @@ dependencies = [ [[package]] name = "futures-core" -version = 
"0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" [[package]] name = "futures-executor" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" dependencies = [ "futures-core", "futures-task", @@ -2041,9 +2366,9 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" [[package]] name = "futures-lite" @@ -2062,26 +2387,26 @@ dependencies = [ [[package]] name = "futures-macro" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.98", ] [[package]] name = "futures-sink" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" [[package]] name = "futures-task" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" [[package]] name = "futures-timer" @@ -2091,9 +2416,9 @@ checksum = "f288b0a4f20f9a56b5d1da57e2227c661b7b16168e2f72365f57b63326e29b24" [[package]] name = "futures-util" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" dependencies = [ "futures-channel", "futures-core", @@ -2119,9 +2444,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.14" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94b22e06ecb0110981051723910cbf0b5f5e09a2062dd7663334ee79a9d1286c" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if", "js-sys", @@ -2130,17 +2455,29 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "getrandom" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43a49c392881ce6d5c3b8cb70f98717b7c07aabbdff06687b9030dbfbe2725f8" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.13.3+wasi-0.2.2", + "windows-targets 0.52.6", +] + [[package]] name = "gimli" -version = "0.28.1" +version = "0.31.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" +checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" [[package]] name = "glob" -version = "0.3.1" +version = "0.3.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" +checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" [[package]] name = "h2" @@ -2154,7 +2491,7 @@ dependencies = [ "futures-sink", "futures-util", "http 0.2.12", - "indexmap 2.5.0", + "indexmap 2.7.1", "slab", "tokio", "tokio-util", @@ -2163,17 +2500,17 @@ dependencies = [ [[package]] name = "h2" -version = "0.4.5" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa82e28a107a8cc405f0839610bdc9b15f1e25ec7d696aa5cf173edbcb1486ab" +checksum = "ccae279728d634d083c00f6099cb58f01cc99c145b84b8be2f6c74618d79922e" dependencies = [ "atomic-waker", "bytes", "fnv", "futures-core", "futures-sink", - "http 1.1.0", - "indexmap 2.5.0", + "http 1.2.0", + "indexmap 2.7.1", "slab", "tokio", "tokio-util", @@ -2222,12 +2559,6 @@ dependencies = [ "unicode-segmentation", ] -[[package]] -name = "heck" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" - [[package]] name = "heck" version = "0.5.0" @@ -2246,6 +2577,15 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + [[package]] name = "hostname" version = "0.4.0" @@ -2270,9 +2610,9 @@ dependencies = [ [[package]] name = "http" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258" +checksum = "f16ca2af56261c99fba8bac40a10251ce8188205a4c448fbb745a2e4daa76fea" dependencies = [ "bytes", "fnv", @@ -2281,11 +2621,11 @@ dependencies = [ [[package]] name = "http-auth-basic" -version = "0.3.3" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd2e17aacf7f4a2428def798e2ff4f4f883c0987bdaf47dd5c8bc027bc9f1ebc" +checksum = "0e0c088bddfd73005b09807131224ad12c302655436b1270c8346a3ae8aaa37a" dependencies = [ - "base64 0.13.1", + "base64 0.22.1", ] [[package]] @@ -2301,32 +2641,32 @@ dependencies = [ [[package]] name = "http-body" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" dependencies = [ "bytes", - "http 1.1.0", + "http 1.2.0", ] [[package]] name = "http-body-util" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0475f8b2ac86659c21b64320d5d653f9efe42acd2a4e560073ec61a155a34f1d" +checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f" dependencies = [ "bytes", - "futures-core", - "http 1.1.0", - "http-body 1.0.0", + "futures-util", + "http 1.2.0", + "http-body 1.0.1", "pin-project-lite", ] [[package]] name = "httparse" -version = "1.8.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" +checksum = 
"f2d708df4e7140240a16cd6ab0ab65c972d7433ab77819ea693fde9c43811e2a" [[package]] name = "httpdate" @@ -2358,9 +2698,9 @@ dependencies = [ [[package]] name = "hyper" -version = "0.14.30" +version = "0.14.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a152ddd61dfaec7273fe8419ab357f33aee0d914c5f4efbf0d96fa749eea5ec9" +checksum = "41dfc780fdec9373c01bae43289ea34c972e40ee3c9f6b3c8801a35f35586ce7" dependencies = [ "bytes", "futures-channel", @@ -2382,16 +2722,16 @@ dependencies = [ [[package]] name = "hyper" -version = "1.4.1" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50dfd22e0e76d0f662d429a5f80fcaf3855009297eab6a0a9f8543834744ba05" +checksum = "cc2b571658e38e0c01b1fdca3bbbe93c00d3d71693ff2770043f8c29bc7d6f80" dependencies = [ "bytes", "futures-channel", "futures-util", - "h2 0.4.5", - "http 1.1.0", - "http-body 1.0.0", + "h2 0.4.7", + "http 1.2.0", + "http-body 1.0.1", "httparse", "httpdate", "itoa", @@ -2402,92 +2742,211 @@ dependencies = [ ] [[package]] -name = "hyper-rustls" -version = "0.24.2" +name = "hyper-rustls" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" +dependencies = [ + "futures-util", + "http 0.2.12", + "hyper 0.14.32", + "log", + "rustls 0.21.12", + "rustls-native-certs 0.6.3", + "tokio", + "tokio-rustls 0.24.1", +] + +[[package]] +name = "hyper-rustls" +version = "0.27.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d191583f3da1305256f22463b9bb0471acad48a4e534a5218b9963e9c1f59b2" +dependencies = [ + "futures-util", + "http 1.2.0", + "hyper 1.6.0", + "hyper-util", + "rustls 0.23.23", + "rustls-native-certs 0.8.1", + "rustls-pki-types", + "tokio", + "tokio-rustls 0.26.1", + "tower-service", + "webpki-roots 0.26.8", +] + +[[package]] +name = "hyper-timeout" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0" +dependencies = [ + "hyper 1.6.0", + "hyper-util", + "pin-project-lite", + "tokio", + "tower-service", +] + +[[package]] +name = "hyper-util" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df2dcfbe0677734ab2f3ffa7fa7bfd4706bfdc1ef393f2ee30184aed67e631b4" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "http 1.2.0", + "http-body 1.0.1", + "hyper 1.6.0", + "pin-project-lite", + "socket2", + "tokio", + "tower-service", + "tracing", +] + +[[package]] +name = "iana-time-zone" +version = "0.1.61" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "235e081f3925a06703c2d0117ea8b91f042756fd6e7a6e5d901e8ca1a996b220" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows-core 0.52.0", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "icu_collections" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locid" +version = "1.5.0" +source 
= "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_locid_transform" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_locid_transform_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_locid_transform_data" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" -dependencies = [ - "futures-util", - "http 0.2.12", - "hyper 0.14.30", - "rustls 0.21.12", - "tokio", - "tokio-rustls 0.24.1", -] +checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e" [[package]] -name = "hyper-rustls" -version = "0.27.3" +name = "icu_normalizer" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08afdbb5c31130e3034af566421053ab03787c640246a446327f550d11bcb333" +checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f" dependencies = [ - "futures-util", - "http 1.1.0", - "hyper 1.4.1", - "hyper-util", - "rustls 0.23.13", - "rustls-native-certs", - "rustls-pki-types", - "tokio", - "tokio-rustls 0.26.0", - "tower-service", - "webpki-roots 0.26.1", + "displaydoc", + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "utf16_iter", + "utf8_iter", + "write16", + "zerovec", ] [[package]] -name = "hyper-timeout" -version = "0.5.1" +name = "icu_normalizer_data" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3203a961e5c83b6f5498933e78b6b263e208c197b63e9c6c53cc82ffd3f63793" -dependencies = [ - "hyper 1.4.1", - "hyper-util", - "pin-project-lite", - "tokio", - "tower-service", -] +checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516" [[package]] -name = "hyper-util" -version = "0.1.6" +name = "icu_properties" +version = "1.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ab92f4f49ee4fb4f997c784b7a2e0fa70050211e0b6a287f898c3c9785ca956" +checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5" dependencies = [ - "bytes", - "futures-channel", - "futures-util", - "http 1.1.0", - "http-body 1.0.0", - "hyper 1.4.1", - "pin-project-lite", - "socket2", - "tokio", - "tower", - "tower-service", - "tracing", + "displaydoc", + "icu_collections", + "icu_locid_transform", + "icu_properties_data", + "icu_provider", + "tinystr", + "zerovec", ] [[package]] -name = "iana-time-zone" -version = "0.1.60" +name = "icu_properties_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569" + +[[package]] +name = "icu_provider" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141" +checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9" dependencies = [ - "android_system_properties", - "core-foundation-sys", - "iana-time-zone-haiku", - "js-sys", - "wasm-bindgen", - "windows-core 0.52.0", + "displaydoc", + "icu_locid", + "icu_provider_macros", + 
"stable_deref_trait", + "tinystr", + "writeable", + "yoke", + "zerofrom", + "zerovec", ] [[package]] -name = "iana-time-zone-haiku" -version = "0.1.2" +name = "icu_provider_macros" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ - "cc", + "proc-macro2", + "quote", + "syn 2.0.98", ] [[package]] @@ -2506,11 +2965,32 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "idna" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daca1df1c957320b2cf139ac61e7bd64fed304c5040df000a745aa1de3b4ef71" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + [[package]] name = "impl-more" -version = "0.1.6" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "206ca75c9c03ba3d4ace2460e57b189f39f43de612c2f85836e65c929701bb2d" +checksum = "e8a5a9a0ff0086c7a148acb942baaabeadf9504d10400b5a05645853729b9cd2" [[package]] name = "indexmap" @@ -2524,19 +3004,19 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.5.0" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68b900aa2f7301e21c36462b170ee99994de34dff39a4a6a528e80e7376d07e5" +checksum = "8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652" dependencies = [ "equivalent", - "hashbrown 0.14.5", + "hashbrown 0.15.2", ] [[package]] name = "instant" -version = "0.1.12" +version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" +checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" dependencies = [ "cfg-if", ] @@ -2548,21 +3028,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" [[package]] -name = "io-lifetimes" -version = "1.0.11" +name = "ipnet" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" -dependencies = [ - "hermit-abi", - "libc", - "windows-sys 0.48.0", -] +checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" [[package]] -name = "ipnet" -version = "2.9.0" +name = "is_terminal_polyfill" +version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" [[package]] name = "itertools" @@ -2582,27 +3057,37 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + [[package]] name = "itoa" -version = "1.0.11" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" 
+checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" [[package]] name = "jobserver" -version = "0.1.31" +version = "0.1.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2b099aaa34a9751c5bf0878add70444e1ed2dd73f347be99003d4577277de6e" +checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" dependencies = [ "libc", ] [[package]] name = "js-sys" -version = "0.3.69" +version = "0.3.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" +checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" dependencies = [ + "once_cell", "wasm-bindgen", ] @@ -2614,9 +3099,9 @@ checksum = "d4345964bb142484797b161f473a503a434de77149dd8c7427788c6e13379388" [[package]] name = "lazy_static" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "lexical-core" @@ -2684,27 +3169,37 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.153" +version = "0.2.169" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" +checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" [[package]] name = "libm" -version = "0.2.8" +version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" +checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa" [[package]] -name = "linux-raw-sys" -version = "0.1.4" +name = "libredox" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" +checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d" +dependencies = [ + "bitflags 2.8.0", + "libc", +] [[package]] name = "linux-raw-sys" -version = "0.4.13" +version = "0.4.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" + +[[package]] +name = "litemap" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" +checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104" [[package]] name = "local-channel" @@ -2725,9 +3220,9 @@ checksum = "4d873d7c67ce09b42110d801813efbc9364414e356be9935700d368351657487" [[package]] name = "lock_api" -version = "0.4.11" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" dependencies = [ "autocfg", "scopeguard", @@ -2741,9 +3236,9 @@ checksum = "9374ef4228402d4b7e403e5838cb880d9ee663314b0a900d5a6aabf0c213552e" [[package]] name = "log" -version = "0.4.21" +version = "0.4.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" +checksum = "04cbf5b083de1c7e0222a7a51dbfdba1cbe1c6ab0b15e29fff3f6c077fd9cd9f" [[package]] name = "lz4_flex" @@ -2813,9 
+3308,9 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" [[package]] name = "mime_guess" -version = "2.0.4" +version = "2.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4192263c238a5f0d0c6bfd21f336a313a4ce1c450542449ca191bb657b4642ef" +checksum = "f7c44f8e672c00fe5308fa235f821cb4198414e1c77935c1ab6948d3fd78550e" dependencies = [ "mime", "unicase", @@ -2829,41 +3324,19 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" [[package]] name = "miniz_oxide" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" -dependencies = [ - "adler", -] - -[[package]] -name = "miniz_oxide" -version = "0.8.0" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +checksum = "b3b1c9bd4fe1f0f8b387f6eb9eb3b4a1aa26185e5750efb9140301703f62cd1b" dependencies = [ "adler2", ] [[package]] name = "mio" -version = "0.8.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" -dependencies = [ - "libc", - "log", - "wasi 0.11.0+wasi-snapshot-preview1", - "windows-sys 0.48.0", -] - -[[package]] -name = "mio" -version = "1.0.2" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec" +checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" dependencies = [ - "hermit-abi", "libc", "log", "wasi 0.11.0+wasi-snapshot-preview1", @@ -2901,9 +3374,9 @@ dependencies = [ [[package]] name = "num" -version = "0.4.2" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3135b08af27d103b0a51f2ae0f8632117b7b185ccf931445affa8df530576a41" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" dependencies = [ "num-bigint", "num-complex", @@ -2915,20 +3388,19 @@ dependencies = [ [[package]] name = "num-bigint" -version = "0.4.4" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "608e7659b5c3d7cba262d894801b9ec9d00de989e8a82bd4bef91d08da45cdc0" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" dependencies = [ - "autocfg", "num-integer", "num-traits", ] [[package]] name = "num-complex" -version = "0.4.5" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23c6602fda94a57c990fe0df199a035d83576b496aa29f4e634a8ac6004e68a6" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" dependencies = [ "num-traits", ] @@ -2950,9 +3422,9 @@ dependencies = [ [[package]] name = "num-iter" -version = "0.1.44" +version = "0.1.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d869c01cc0c455284163fd0092f1f93835385ccab5a98a0dcc497b2f8bf055a9" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" dependencies = [ "autocfg", "num-integer", @@ -2961,11 +3433,10 @@ dependencies = [ [[package]] name = "num-rational" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" 
dependencies = [ - "autocfg", "num-bigint", "num-integer", "num-traits", @@ -2973,9 +3444,9 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.18" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", "libm", @@ -3009,7 +3480,7 @@ dependencies = [ "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.98", ] [[package]] @@ -3023,9 +3494,9 @@ dependencies = [ [[package]] name = "object" -version = "0.32.2" +version = "0.36.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" +checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87" dependencies = [ "memchr", ] @@ -3037,21 +3508,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3cfccb68961a56facde1163f9319e0d15743352344e7808a11795fb99698dcaf" dependencies = [ "async-trait", - "base64 0.22.0", + "base64 0.22.1", "bytes", "chrono", "futures", "httparse", "humantime", - "hyper 1.4.1", + "hyper 1.6.0", "itertools 0.13.0", "md-5", "parking_lot", "percent-encoding", "quick-xml", - "rand", - "reqwest 0.12.8", + "rand 0.8.5", + "reqwest 0.12.12", "ring", + "rustls-pemfile 2.2.0", "serde", "serde_json", "snafu", @@ -3063,9 +3535,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.19.0" +version = "1.20.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +checksum = "945462a4b81e43c4e3ba96bd7b49d834c6f61198356aa858733bc4acf3cbe62e" [[package]] name = "openid" @@ -3073,42 +3545,42 @@ version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "627898ab5b3fff5e5f1dc0e404bafdbb87a4337d815e86149f53640380946ccc" dependencies = [ - "base64 0.22.0", + "base64 0.22.1", "biscuit", "chrono", "lazy_static", "mime", - "reqwest 0.12.8", + "reqwest 0.12.12", "serde", "serde_json", - "thiserror 1.0.64", + "thiserror 1.0.69", "url", "validator", ] [[package]] name = "openssl-probe" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" [[package]] name = "opentelemetry" version = "0.27.1" -source = "git+https://github.com/parseablehq/opentelemetry-rust?branch=fix-metrics-u64-serialization#7e84c98d75ae16993a37bd5ff75a9768d652fe8f" +source = "git+https://github.com/parseablehq/opentelemetry-rust?branch=fix-metrics-u64-serialization#7b113419751fc4a785107d9b6d3b18284f406076" dependencies = [ "futures-core", "futures-sink", "js-sys", "pin-project-lite", - "thiserror 2.0.9", + "thiserror 2.0.11", "tracing", ] [[package]] name = "opentelemetry-proto" version = "0.27.0" -source = "git+https://github.com/parseablehq/opentelemetry-rust?branch=fix-metrics-u64-serialization#7e84c98d75ae16993a37bd5ff75a9768d652fe8f" +source = "git+https://github.com/parseablehq/opentelemetry-rust?branch=fix-metrics-u64-serialization#7b113419751fc4a785107d9b6d3b18284f406076" dependencies = [ "hex", "opentelemetry", @@ -3116,12 +3588,13 @@ dependencies = [ "prost", "serde", "tonic", + "tracing", ] [[package]] name = "opentelemetry_sdk" version = "0.27.1" -source = 
"git+https://github.com/parseablehq/opentelemetry-rust?branch=fix-metrics-u64-serialization#7e84c98d75ae16993a37bd5ff75a9768d652fe8f" +source = "git+https://github.com/parseablehq/opentelemetry-rust?branch=fix-metrics-u64-serialization#7b113419751fc4a785107d9b6d3b18284f406076" dependencies = [ "async-trait", "futures-channel", @@ -3130,11 +3603,17 @@ dependencies = [ "glob", "opentelemetry", "percent-encoding", - "rand", + "rand 0.8.5", "serde_json", - "thiserror 2.0.9", + "thiserror 2.0.11", ] +[[package]] +name = "option-ext" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" + [[package]] name = "ordered-float" version = "2.10.1" @@ -3144,6 +3623,12 @@ dependencies = [ "num-traits", ] +[[package]] +name = "outref" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e" + [[package]] name = "overload" version = "0.1.1" @@ -3152,15 +3637,15 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" [[package]] name = "parking" -version = "2.2.0" +version = "2.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb813b8af86854136c6922af0598d719255ecb2179515e6e7730d468f05c9cae" +checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba" [[package]] name = "parking_lot" -version = "0.12.1" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" dependencies = [ "lock_api", "parking_lot_core", @@ -3168,22 +3653,22 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.9" +version = "0.9.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" dependencies = [ "cfg-if", "libc", "redox_syscall", "smallvec", - "windows-targets 0.48.5", + "windows-targets 0.52.6", ] [[package]] name = "parquet" -version = "53.3.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b449890367085eb65d7d3321540abc3d7babbd179ce31df0016e90719114191" +checksum = "8a01a0efa30bbd601ae85b375c728efdb211ade54390281628a7b16708beb235" dependencies = [ "ahash", "arrow-array", @@ -3193,7 +3678,7 @@ dependencies = [ "arrow-ipc", "arrow-schema", "arrow-select", - "base64 0.22.0", + "base64 0.22.1", "brotli 7.0.0", "bytes", "chrono", @@ -3207,6 +3692,7 @@ dependencies = [ "object_store", "paste", "seq-macro", + "simdutf8", "snap", "thrift", "tokio", @@ -3217,9 +3703,9 @@ dependencies = [ [[package]] name = "parse-zoneinfo" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c705f256449c60da65e11ff6626e0c16a0a0b96aaa348de61376b249bc340f41" +checksum = "1f2a05b18d44e2957b88f96ba460715e295bc1d7510468a2f3d3b44535d26c24" dependencies = [ "regex", ] @@ -3237,13 +3723,17 @@ dependencies = [ "argon2", "arrow", "arrow-array", + "arrow-buffer", "arrow-flight", "arrow-ipc", "arrow-json", + "arrow-ord", "arrow-schema", "arrow-select", "async-trait", - "base64 0.22.0", + "aws-config", + "aws-credential-types", + "base64 0.22.1", "byteorder", "bytes", "cargo_toml", @@ -3255,6 
+3745,7 @@ dependencies = [ "crossterm", "datafusion", "derive_more", + "dirs", "fs_extra", "futures", "futures-util", @@ -3273,18 +3764,19 @@ dependencies = [ "once_cell", "openid", "opentelemetry-proto", + "parking_lot", "parquet", "path-clean", "prometheus", "prometheus-parse", - "rand", + "rand 0.8.5", "rdkafka", "regex", "relative-path", "reqwest 0.11.27", "rstest", "rustls 0.22.4", - "rustls-pemfile 2.1.2", + "rustls-pemfile 2.2.0", "semver", "serde", "serde_json", @@ -3294,7 +3786,7 @@ dependencies = [ "static-files", "structopt-derive", "sysinfo", - "thiserror 2.0.9", + "thiserror 2.0.11", "tokio", "tokio-stream", "tonic", @@ -3318,7 +3810,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "346f04948ba92c43e8469c1ee6736c7563d71012b17d40745260fe106aac2166" dependencies = [ "base64ct", - "rand_core", + "rand_core 0.6.4", "subtle", ] @@ -3357,28 +3849,28 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "petgraph" -version = "0.6.4" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1d3afd2628e69da2be385eb6f2fd57c8ac7977ceeff6dc166ff1657b0e386a9" +checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" dependencies = [ "fixedbitset", - "indexmap 2.5.0", + "indexmap 2.7.1", ] [[package]] name = "phf" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" +checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" dependencies = [ "phf_shared", ] [[package]] name = "phf_codegen" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a" +checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" dependencies = [ "phf_generator", "phf_shared", @@ -3386,48 +3878,48 @@ dependencies = [ [[package]] name = "phf_generator" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" +checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" dependencies = [ "phf_shared", - "rand", + "rand 0.8.5", ] [[package]] name = "phf_shared" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" +checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" dependencies = [ "siphasher", ] [[package]] name = "pin-project" -version = "1.1.5" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6bf43b791c5b9e34c3d182969b4abb522f9343702850a2e57f460d00d09b4b3" +checksum = "dfe2e71e1471fe07709406bf725f710b02927c9c54b2b5b2ec0e8087d97c327d" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.1.5" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" +checksum = "f6e859e6e5bd50440ab63c47e3ebabc90f26251f7c73c3d3e837b74a1cc3fa67" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.98", ] [[package]] name = "pin-project-lite" -version = "0.2.14" +version = "0.2.16" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" [[package]] name = "pin-utils" @@ -3437,9 +3929,9 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pkg-config" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" +checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2" [[package]] name = "powerfmt" @@ -3449,9 +3941,12 @@ checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" [[package]] name = "ppv-lite86" -version = "0.2.17" +version = "0.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy 0.7.35", +] [[package]] name = "proc-macro-crate" @@ -3488,31 +3983,41 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.86" +version = "1.0.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +checksum = "60946a68e5f9d28b0dc1c21bb8a97ee7d018a8b322fa57838ba31cc878e22d99" dependencies = [ "unicode-ident", ] [[package]] name = "procfs" -version = "0.14.2" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1de8dacb0873f77e6aefc6d71e044761fcc68060290f5b1089fcdf84626bb69" +checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" dependencies = [ - "bitflags 1.3.2", - "byteorder", + "bitflags 2.8.0", "hex", "lazy_static", - "rustix 0.36.17", + "procfs-core", + "rustix", +] + +[[package]] +name = "procfs-core" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" +dependencies = [ + "bitflags 2.8.0", + "hex", ] [[package]] name = "prometheus" -version = "0.13.3" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "449811d15fbdf5ceb5c1144416066429cf82316e2ec8ce0c1f6f8a02e7bbcf8c" +checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1" dependencies = [ "cfg-if", "fnv", @@ -3522,7 +4027,7 @@ dependencies = [ "parking_lot", "procfs", "protobuf", - "thiserror 1.0.64", + "thiserror 1.0.69", ] [[package]] @@ -3539,9 +4044,9 @@ dependencies = [ [[package]] name = "prost" -version = "0.13.3" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b0487d90e047de87f984913713b85c601c05609aad5b0df4b4573fbf69aa13f" +checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5" dependencies = [ "bytes", "prost-derive", @@ -3549,22 +4054,22 @@ dependencies = [ [[package]] name = "prost-derive" -version = "0.13.3" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5" +checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" dependencies = [ "anyhow", - "itertools 0.13.0", + "itertools 0.14.0", "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.98", ] [[package]] name = "prost-types" -version = 
"0.13.3" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4759aa0d3a6232fb8dbdb97b61de2c20047c68aca932c7ed76da9d788508d670" +checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16" dependencies = [ "prost", ] @@ -3612,57 +4117,61 @@ dependencies = [ [[package]] name = "quinn" -version = "0.11.5" +version = "0.11.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c7c5fdde3cdae7203427dc4f0a68fe0ed09833edc525a03456b153b79828684" +checksum = "62e96808277ec6f97351a2380e6c25114bc9e67037775464979f3037c92d05ef" dependencies = [ "bytes", "pin-project-lite", "quinn-proto", "quinn-udp", "rustc-hash", - "rustls 0.23.13", + "rustls 0.23.23", "socket2", - "thiserror 1.0.64", + "thiserror 2.0.11", "tokio", "tracing", ] [[package]] name = "quinn-proto" -version = "0.11.8" +version = "0.11.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fadfaed2cd7f389d0161bb73eeb07b7b78f8691047a6f3e73caaeae55310a4a6" +checksum = "a2fe5ef3495d7d2e377ff17b1a8ce2ee2ec2a18cde8b6ad6619d65d0701c135d" dependencies = [ "bytes", - "rand", + "getrandom 0.2.15", + "rand 0.8.5", "ring", "rustc-hash", - "rustls 0.23.13", + "rustls 0.23.23", + "rustls-pki-types", "slab", - "thiserror 1.0.64", + "thiserror 2.0.11", "tinyvec", "tracing", + "web-time", ] [[package]] name = "quinn-udp" -version = "0.5.4" +version = "0.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8bffec3605b73c6f1754535084a85229fa8a30f86014e6c81aeec4abb68b0285" +checksum = "1c40286217b4ba3a71d644d752e6a0b71f13f1b6a2c5311acfcbe0c2418ed904" dependencies = [ + "cfg_aliases", "libc", "once_cell", "socket2", "tracing", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] name = "quote" -version = "1.0.37" +version = "1.0.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" dependencies = [ "proc-macro2", ] @@ -3674,8 +4183,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", - "rand_chacha", - "rand_core", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + +[[package]] +name = "rand" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3779b94aeb87e8bd4e834cee3650289ee9e0d5677f976ecdb6d219e5f4f6cd94" +dependencies = [ + "rand_chacha 0.9.0", + "rand_core 0.9.0", + "zerocopy 0.8.17", ] [[package]] @@ -3685,7 +4205,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.0", ] [[package]] @@ -3694,7 +4224,17 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom", + "getrandom 0.2.15", +] + +[[package]] +name = "rand_core" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b08f3c9802962f7e1b25113931d94f43ed9725bebc59db9d0c3e9a23b67e15ff" +dependencies = [ + "getrandom 0.3.1", + "zerocopy 0.8.17", ] [[package]] @@ -3772,27 +4312,38 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ "quote", - "syn 2.0.87", + "syn 2.0.98", ] [[package]] name = "redox_syscall" -version = "0.4.1" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.8.0", +] + +[[package]] +name = "redox_users" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd6f9d3d47bdd2ad6945c5015a226ec6155d0bcdfd8f7cd29f86b71f8de99d2b" +dependencies = [ + "getrandom 0.2.15", + "libredox", + "thiserror 2.0.11", ] [[package]] name = "regex" -version = "1.11.0" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38200e5ee88914975b69f657f0801b6f6dccafd44fd9326302a4aaeecfacb1d8" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" dependencies = [ "aho-corasick", "memchr", - "regex-automata 0.4.8", + "regex-automata 0.4.9", "regex-syntax 0.8.5", ] @@ -3807,9 +4358,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.8" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", @@ -3858,7 +4409,7 @@ dependencies = [ "h2 0.3.26", "http 0.2.12", "http-body 0.4.6", - "hyper 0.14.30", + "hyper 0.14.32", "hyper-rustls 0.24.2", "ipnet", "js-sys", @@ -3888,20 +4439,20 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.12.8" +version = "0.12.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f713147fbe92361e52392c73b8c9e48c04c6625bce969ef54dc901e58e042a7b" +checksum = "43e734407157c3c2034e0258f5e4473ddb361b1e85f95a66690d67264d7cd1da" dependencies = [ - "base64 0.22.0", + "base64 0.22.1", "bytes", "futures-core", "futures-util", - "h2 0.4.5", - "http 1.1.0", - "http-body 1.0.0", + "h2 0.4.7", + "http 1.2.0", + "http-body 1.0.1", "http-body-util", - "hyper 1.4.1", - "hyper-rustls 0.27.3", + "hyper 1.6.0", + "hyper-rustls 0.27.5", "hyper-util", "ipnet", "js-sys", @@ -3911,24 +4462,25 @@ dependencies = [ "percent-encoding", "pin-project-lite", "quinn", - "rustls 0.23.13", - "rustls-native-certs", - "rustls-pemfile 2.1.2", + "rustls 0.23.23", + "rustls-native-certs 0.8.1", + "rustls-pemfile 2.2.0", "rustls-pki-types", "serde", "serde_json", "serde_urlencoded", - "sync_wrapper 1.0.1", + "sync_wrapper 1.0.2", "tokio", - "tokio-rustls 0.26.0", + "tokio-rustls 0.26.1", "tokio-util", + "tower 0.5.2", "tower-service", "url", "wasm-bindgen", "wasm-bindgen-futures", "wasm-streams", "web-sys", - "webpki-roots 0.26.1", + "webpki-roots 0.26.8", "windows-registry", ] @@ -3940,7 +4492,7 @@ checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" dependencies = [ "cc", "cfg-if", - "getrandom", + "getrandom 0.2.15", "libc", "spin", "untrusted", @@ -3973,21 +4525,21 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.87", + "syn 2.0.98", 
"unicode-ident", ] [[package]] name = "rustc-demangle" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" [[package]] name = "rustc-hash" -version = "2.0.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" [[package]] name = "rustc_version" @@ -4000,29 +4552,15 @@ dependencies = [ [[package]] name = "rustix" -version = "0.36.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "305efbd14fde4139eb501df5f136994bb520b033fa9fbdce287507dc23b8c7ed" -dependencies = [ - "bitflags 1.3.2", - "errno", - "io-lifetimes", - "libc", - "linux-raw-sys 0.1.4", - "windows-sys 0.45.0", -] - -[[package]] -name = "rustix" -version = "0.38.34" +version = "0.38.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" +checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" dependencies = [ - "bitflags 2.5.0", + "bitflags 2.8.0", "errno", "libc", - "linux-raw-sys 0.4.13", - "windows-sys 0.52.0", + "linux-raw-sys", + "windows-sys 0.59.0", ] [[package]] @@ -4053,9 +4591,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.13" +version = "0.23.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2dabaac7466917e566adb06783a81ca48944c6898a1b08b9374106dd671f4c8" +checksum = "47796c98c480fce5406ef69d1c76378375492c3b0a0de587be0c1d9feb12f395" dependencies = [ "log", "once_cell", @@ -4068,15 +4606,26 @@ dependencies = [ [[package]] name = "rustls-native-certs" -version = "0.8.0" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9aace74cb666635c918e9c12bc0d348266037aa8eb599b5cba565709a8dff00" +dependencies = [ + "openssl-probe", + "rustls-pemfile 1.0.4", + "schannel", + "security-framework 2.11.1", +] + +[[package]] +name = "rustls-native-certs" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcaf18a4f2be7326cd874a5fa579fae794320a0f388d365dca7e480e55f83f8a" +checksum = "7fcff2dd52b58a8d98a70243663a0d234c4e2b79235637849d15913394a247d3" dependencies = [ "openssl-probe", - "rustls-pemfile 2.1.2", "rustls-pki-types", "schannel", - "security-framework", + "security-framework 3.2.0", ] [[package]] @@ -4090,19 +4639,21 @@ dependencies = [ [[package]] name = "rustls-pemfile" -version = "2.1.2" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29993a25686778eb88d4189742cd713c9bce943bc54251a33509dc63cbacf73d" +checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" dependencies = [ - "base64 0.22.0", "rustls-pki-types", ] [[package]] name = "rustls-pki-types" -version = "1.9.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e696e35370c65c9c541198af4543ccd580cf17fc25d8e05c5a242b202488c55" +checksum = "917ce264624a4b4db1c364dcc35bfca9ded014d0a958cd47ad3e960e988ea51c" +dependencies = [ + "web-time", +] [[package]] name = "rustls-webpki" @@ -4127,15 +4678,15 @@ dependencies = [ [[package]] name = "rustversion" -version = "1.0.15" +version = 
"1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80af6f9131f277a45a3fba6ce8e2258037bb0477a67e610d3c1fe046ab31de47" +checksum = "f7c45b9784283f1b2e7fb61b42047c2fd678ef0960d4f6f1eba131594cc369d4" [[package]] name = "ryu" -version = "1.0.17" +version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" +checksum = "6ea1a2d0a644769cc99faa24c3ad26b379b786fe7c36fd3c546254801650e6dd" [[package]] name = "same-file" @@ -4148,11 +4699,11 @@ dependencies = [ [[package]] name = "schannel" -version = "0.1.23" +version = "0.1.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" +checksum = "1f29ebaa345f945cec9fbbc532eb307f0fdad8161f281b6369539c8d84876b3d" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -4173,12 +4724,25 @@ dependencies = [ [[package]] name = "security-framework" -version = "2.10.0" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" +dependencies = [ + "bitflags 2.8.0", + "core-foundation 0.9.4", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework" +version = "3.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "770452e37cad93e0a50d5abc3990d2bc351c36d0328f86cefec2f2fb206eaef6" +checksum = "271720403f46ca04f7ba6f55d438f8bd878d6b8ca0a1046e8228c4145bcbb316" dependencies = [ - "bitflags 1.3.2", - "core-foundation", + "bitflags 2.8.0", + "core-foundation 0.10.0", "core-foundation-sys", "libc", "security-framework-sys", @@ -4186,9 +4750,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.10.0" +version = "2.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41f3cc463c0ef97e11c3461a9d3787412d30e8e7eb907c79180c4a57bf7c04ef" +checksum = "49db231d56a190491cb4aeda9527f1ad45345af50b0851622a7adb8c03b01c32" dependencies = [ "core-foundation-sys", "libc", @@ -4196,9 +4760,9 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.22" +version = "1.0.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92d43fe69e652f3df9bdc2b85b2854a0825b86e4fb76bc44d945137d053639ca" +checksum = "f79dfe2d285b0488816f30e700a7438c5a73d816b5b7d3ac72fbc48b0d185e03" dependencies = [ "serde", ] @@ -4211,32 +4775,33 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.198" +version = "1.0.217" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9846a40c979031340571da2545a4e5b7c4163bdae79b301d5f86d03979451fcc" +checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.198" +version = "1.0.217" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e88edab869b01783ba905e7d0153f9fc1a6505a96e4ad3018011eedb838566d9" +checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.98", ] [[package]] name = "serde_json" -version = "1.0.116" +version = "1.0.138" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"3e17db7126d17feb94eb3fad46bf1a96b034e8aacbc2e775fe81505f8b0b2813" +checksum = "d434192e7da787e94a6ea7e9670b26a036d0ca41e0b7efb2676dd32bae872949" dependencies = [ - "indexmap 2.5.0", + "indexmap 2.7.1", "itoa", + "memchr", "ryu", "serde", ] @@ -4249,7 +4814,7 @@ checksum = "6c64451ba24fc7a6a2d60fc75dd9c83c90903b19028d4eff35e88fc1e86564e9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.98", ] [[package]] @@ -4286,9 +4851,9 @@ dependencies = [ [[package]] name = "sha1_smol" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012" +checksum = "bbfa15b3dddfee50a0fff136974b3e1bde555604ba463834a7eb7deb6417705d" [[package]] name = "sha2" @@ -4333,7 +4898,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34db1a06d485c9142248b7a054f034b349b212551f3dfd19c94d45a754a217cd" dependencies = [ "libc", - "mio 1.0.2", + "mio", "signal-hook", ] @@ -4352,11 +4917,17 @@ version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + [[package]] name = "siphasher" -version = "0.3.11" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" +checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" [[package]] name = "slab" @@ -4391,7 +4962,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.98", ] [[package]] @@ -4402,9 +4973,9 @@ checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" [[package]] name = "socket2" -version = "0.5.6" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05ffd9c0a93b7543e062e759284fcf5f5e3b098501104bfbdde4d404db792871" +checksum = "c970269d99b64e60ec3bd6ad27270092a5394c4e309314b18ae3fe575695fbe8" dependencies = [ "libc", "windows-sys 0.52.0", @@ -4418,11 +4989,12 @@ checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" [[package]] name = "sqlparser" -version = "0.53.0" +version = "0.54.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05a528114c392209b3264855ad491fcce534b94a38771b0a0b97a79379275ce8" +checksum = "c66e3b7374ad4a6af849b08b3e7a6eda0edbd82f0fd59b57e22671bf16979899" dependencies = [ "log", + "recursive", "sqlparser_derive", ] @@ -4434,27 +5006,33 @@ checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.98", ] +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + [[package]] name = "stacker" -version = "0.1.15" +version = "0.1.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c886bd4480155fd3ef527d45e9ac8dd7118a898a46530b7b94c3e21866259fce" +checksum = "1d08feb8f695b465baed819b03c128dc23f57a694510ab1f06c77f763975685e" dependencies = [ "cc", "cfg-if", "libc", "psm", - "winapi", + "windows-sys 0.59.0", ] [[package]] name = "static-files" -version = "0.2.3" 
+version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64712ea1e3e140010e1d9605872ba205afa2ab5bd38191cc6ebd248ae1f6a06b" +checksum = "4e8590e848e1c53be9258210bcd4a8f4118e08988f03a4e2d63b62e4ad9f7ced" dependencies = [ "change-detection", "mime_guess", @@ -4469,9 +5047,9 @@ checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" [[package]] name = "strsim" -version = "0.10.0" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "structopt-derive" @@ -4486,30 +5064,11 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "strum" -version = "0.26.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d8cec3501a5194c432b2b7976db6b7d10ec95c253208b45f83f7136aa985e29" - -[[package]] -name = "strum_macros" -version = "0.26.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6cf59daf282c0a494ba14fd21610a0325f9f90ec9d1231dea26bcb1d696c946" -dependencies = [ - "heck 0.4.1", - "proc-macro2", - "quote", - "rustversion", - "syn 2.0.87", -] - [[package]] name = "subtle" -version = "2.5.0" +version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" @@ -4524,9 +5083,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.87" +version = "2.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" +checksum = "36147f1a48ae0ec2b5b3bc5b537d267457555a10dc06f3dbc8cb11ba3006d3b1" dependencies = [ "proc-macro2", "quote", @@ -4541,13 +5100,24 @@ checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" [[package]] name = "sync_wrapper" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" +checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" dependencies = [ "futures-core", ] +[[package]] +name = "synstructure" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.98", +] + [[package]] name = "sysinfo" version = "0.31.4" @@ -4569,7 +5139,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7" dependencies = [ "bitflags 1.3.2", - "core-foundation", + "core-foundation 0.9.4", "system-configuration-sys", ] @@ -4585,54 +5155,56 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.10.1" +version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" +checksum = "38c246215d7d24f48ae091a2902398798e05d978b24315d6efbc00ede9a8bb91" dependencies = [ "cfg-if", - "fastrand 2.0.2", - "rustix 0.38.34", - "windows-sys 0.52.0", + "fastrand 2.3.0", + "getrandom 0.3.1", + "once_cell", + "rustix", + "windows-sys 0.59.0", ] [[package]] name = "thiserror" -version = "1.0.64" 
+version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d50af8abc119fb8bb6dbabcfa89656f46f84aa0ac7688088608076ad2b459a84" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ - "thiserror-impl 1.0.64", + "thiserror-impl 1.0.69", ] [[package]] name = "thiserror" -version = "2.0.9" +version = "2.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f072643fd0190df67a8bab670c20ef5d8737177d6ac6b2e9a236cb096206b2cc" +checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc" dependencies = [ - "thiserror-impl 2.0.9", + "thiserror-impl 2.0.11", ] [[package]] name = "thiserror-impl" -version = "1.0.64" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08904e7672f5eb876eaaf87e0ce17857500934f4981c4a0ab2b4aa98baac7fc3" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.98", ] [[package]] name = "thiserror-impl" -version = "2.0.9" +version = "2.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b50fa271071aae2e6ee85f842e2e28ba8cd2c5fb67f11fcb1fd70b276f9e7d4" +checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.98", ] [[package]] @@ -4658,9 +5230,9 @@ dependencies = [ [[package]] name = "time" -version = "0.3.36" +version = "0.3.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885" +checksum = "35e7868883861bd0e56d9ac6efcaaca0d6d5d82a2a7ec8209ff492c07cf37b21" dependencies = [ "deranged", "itoa", @@ -4681,9 +5253,9 @@ checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" [[package]] name = "time-macros" -version = "0.2.18" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf" +checksum = "2834e6017e3e5e4b9834939793b282bc03b37a3336245fa820e35e233e2a85de" dependencies = [ "num-conv", "time-core", @@ -4698,11 +5270,21 @@ dependencies = [ "crunchy", ] +[[package]] +name = "tinystr" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f" +dependencies = [ + "displaydoc", + "zerovec", +] + [[package]] name = "tinyvec" -version = "1.6.0" +version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +checksum = "022db8904dfa342efe721985167e9fcd16c29b226db4397ed752a761cfce81e8" dependencies = [ "tinyvec_macros", ] @@ -4715,32 +5297,31 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.37.0" +version = "1.43.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787" +checksum = "3d61fa4ffa3de412bfea335c6ecff681de2b609ba3c77ef3e00e521813a9ed9e" dependencies = [ "backtrace", "bytes", "libc", - "mio 0.8.11", - "num_cpus", + "mio", "parking_lot", "pin-project-lite", "signal-hook-registry", "socket2", "tokio-macros", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] name = "tokio-macros" -version = "2.2.0" +version = 
"2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" +checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.98", ] [[package]] @@ -4766,20 +5347,19 @@ dependencies = [ [[package]] name = "tokio-rustls" -version = "0.26.0" +version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" +checksum = "5f6d0975eaace0cf0fcadee4e4aaa5da15b5c079146f2cffb67c113be122bf37" dependencies = [ - "rustls 0.23.13", - "rustls-pki-types", + "rustls 0.23.23", "tokio", ] [[package]] name = "tokio-stream" -version = "0.1.16" +version = "0.1.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f4e6ce100d0eb49a2734f8c0812bcd324cf357d21810932c5df6b96ef2b86f1" +checksum = "eca58d7bba4a75707817a2c44174253f9236b2d5fbd055602e9d5c07c139a047" dependencies = [ "futures-core", "pin-project-lite", @@ -4788,23 +5368,22 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.10" +version = "0.7.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15" +checksum = "d7fcaa8d55a2bdd6b83ace262b016eca0d79ee02818c5c1bcdf0305114081078" dependencies = [ "bytes", "futures-core", "futures-sink", "pin-project-lite", "tokio", - "tracing", ] [[package]] name = "toml" -version = "0.8.12" +version = "0.8.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9dd1545e8208b4a5af1aa9bbd0b4cf7e9ea08fabc5d0a5c67fcaafa17433aa3" +checksum = "cd87a5cdd6ffab733b2f74bc4fd7ee5fff6634124999ac278c35fc78c6120148" dependencies = [ "serde", "serde_spanned", @@ -4823,11 +5402,11 @@ dependencies = [ [[package]] name = "toml_edit" -version = "0.22.22" +version = "0.22.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5" +checksum = "17b4795ff5edd201c7cd6dca065ae59972ce77d1b80fa0a84d94950ece7d1474" dependencies = [ - "indexmap 2.5.0", + "indexmap 2.7.1", "serde", "serde_spanned", "toml_datetime", @@ -4843,25 +5422,25 @@ dependencies = [ "async-stream", "async-trait", "axum", - "base64 0.22.0", + "base64 0.22.1", "bytes", "flate2", - "h2 0.4.5", - "http 1.1.0", - "http-body 1.0.0", + "h2 0.4.7", + "http 1.2.0", + "http-body 1.0.1", "http-body-util", - "hyper 1.4.1", + "hyper 1.6.0", "hyper-timeout", "hyper-util", "percent-encoding", "pin-project", "prost", - "rustls-pemfile 2.1.2", + "rustls-pemfile 2.2.0", "socket2", "tokio", - "tokio-rustls 0.26.0", + "tokio-rustls 0.26.1", "tokio-stream", - "tower", + "tower 0.4.13", "tower-layer", "tower-service", "tracing", @@ -4874,10 +5453,10 @@ version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5299dd20801ad736dccb4a5ea0da7376e59cd98f213bf1c3d478cf53f4834b58" dependencies = [ - "base64 0.22.0", + "base64 0.22.1", "bytes", - "http 1.1.0", - "http-body 1.0.0", + "http 1.2.0", + "http-body 1.0.1", "http-body-util", "pin-project", "tokio-stream", @@ -4899,7 +5478,7 @@ dependencies = [ "indexmap 1.9.3", "pin-project", "pin-project-lite", - "rand", + "rand 0.8.5", "slab", "tokio", "tokio-util", @@ -4908,16 +5487,31 @@ dependencies = [ "tracing", ] +[[package]] +name = "tower" +version = "0.5.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" +dependencies = [ + "futures-core", + "futures-util", + "pin-project-lite", + "sync_wrapper 1.0.2", + "tokio", + "tower-layer", + "tower-service", +] + [[package]] name = "tower-http" version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e9cd434a998747dd2c4276bc96ee2e0c7a2eadf3cae88e52be55a05fa9053f5" dependencies = [ - "bitflags 2.5.0", + "bitflags 2.8.0", "bytes", - "http 1.1.0", - "http-body 1.0.0", + "http 1.2.0", + "http-body 1.0.1", "http-body-util", "pin-project-lite", "tower-layer", @@ -4930,9 +5524,9 @@ version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "403fa3b783d4b626a8ad51d766ab03cb6d2dbfc46b1c5d4448395e6628dc9697" dependencies = [ - "bitflags 2.5.0", + "bitflags 2.8.0", "bytes", - "http 1.1.0", + "http 1.2.0", "pin-project-lite", "tower-layer", "tower-service", @@ -4946,9 +5540,9 @@ checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" [[package]] name = "tower-service" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" [[package]] name = "tracing" @@ -4970,7 +5564,7 @@ checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.98", ] [[package]] @@ -4996,9 +5590,9 @@ dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.18" +version = "0.3.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b" +checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008" dependencies = [ "matchers", "nu-ansi-term", @@ -5036,57 +5630,53 @@ checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" [[package]] name = "ulid" -version = "1.1.2" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34778c17965aa2a08913b57e1f34db9b4a63f5de31768b55bf20d2795f921259" +checksum = "ab82fc73182c29b02e2926a6df32f2241dbadb5cfc111fd595515b3598f46bb3" dependencies = [ - "getrandom", - "rand", + "rand 0.9.0", "serde", "web-time", ] [[package]] name = "unicase" -version = "2.7.0" +version = "2.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7d2d4dafb69621809a81864c9c1b864479e1235c0dd4e199924b9742439ed89" -dependencies = [ - "version_check", -] +checksum = "75b844d17643ee918803943289730bec8aac480150456169e647ed0b576ba539" [[package]] name = "unicode-bidi" -version = "0.3.15" +version = "0.3.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" +checksum = "5c1cb5db39152898a79168971543b1cb5020dff7fe43c8dc468b0885f5e29df5" [[package]] name = "unicode-ident" -version = "1.0.13" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" +checksum = "a210d160f08b701c8721ba1c726c11662f877ea6b7094007e1ca9a1041945034" [[package]] name = "unicode-normalization" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" +checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" dependencies = [ "tinyvec", ] [[package]] name = "unicode-segmentation" -version = "1.11.0" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202" +checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" [[package]] name = "unicode-width" -version = "0.1.11" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85" +checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" [[package]] name = "untrusted" @@ -5096,57 +5686,76 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "uptime_lib" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4e71ddbefed856d5881821d6ada4e606bbb91fd332296963ed596e2ad2100f3" +checksum = "9e64b558561f12a171bbea5325c3f24f129db371adee1d7ae93b6e310bd69192" dependencies = [ "libc", - "thiserror 1.0.64", - "windows 0.52.0", + "thiserror 1.0.69", + "windows 0.57.0", ] [[package]] name = "ureq" -version = "2.9.6" +version = "2.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11f214ce18d8b2cbe84ed3aa6486ed3f5b285cf8d8fbdbce9f3f767a724adc35" +checksum = "02d1a66277ed75f640d608235660df48c8e3c19f3b4edb6a263315626cc3c01d" dependencies = [ - "base64 0.21.7", + "base64 0.22.1", "flate2", "log", "once_cell", - "rustls 0.22.4", + "rustls 0.23.23", "rustls-pki-types", - "rustls-webpki 0.102.8", "url", - "webpki-roots 0.26.1", + "webpki-roots 0.26.8", ] [[package]] name = "url" -version = "2.5.0" +version = "2.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" +checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" dependencies = [ "form_urlencoded", - "idna", + "idna 1.0.3", "percent-encoding", "serde", ] +[[package]] +name = "urlencoding" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" + +[[package]] +name = "utf16_iter" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + [[package]] name = "utf8parse" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.8.0" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a183cf7feeba97b4dd1c0d46788634f6221d87fa961b305bed08c851829efcc0" +checksum = "ced87ca4be083373936a67f8de945faa23b6b42384bd5b64434850802c6dccd0" dependencies = [ - "getrandom", + "getrandom 0.3.1", + "js-sys", + "wasm-bindgen", ] [[package]] @@ -5155,7 +5764,7 @@ version = "0.18.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "db79c75af171630a3148bd3e6d7c4f42b6a9a014c2945bc5ed0020cbb8d9478e" dependencies = [ - "idna", + "idna 0.5.0", "once_cell", "regex", "serde", @@ -5167,29 +5776,29 @@ dependencies = [ [[package]] name = "validator_derive" -version = "0.18.1" +version = "0.18.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55591299b7007f551ed1eb79a684af7672c19c3193fb9e0a31936987bb2438ec" +checksum = "df0bcf92720c40105ac4b2dda2a4ea3aa717d4d6a862cc217da653a4bd5c6b10" dependencies = [ "darling", "once_cell", "proc-macro-error", "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.98", ] [[package]] name = "valuable" -version = "0.1.0" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" +checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" [[package]] name = "vergen" -version = "8.3.1" +version = "8.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e27d6bdd219887a9eadd19e1c34f32e47fa332301184935c6d9bca26f3cca525" +checksum = "2990d9ea5967266ea0ccf413a4aa5c42a93dbcfda9cb49a97de6931726b12566" dependencies = [ "anyhow", "cargo_metadata", @@ -5201,15 +5810,21 @@ dependencies = [ [[package]] name = "version_check" -version = "0.9.4" +version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "vsimd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" [[package]] name = "waker-fn" -version = "1.1.1" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3c4517f54858c779bbcbf228f4fca63d121bf85fbecb2dc578cdf4a39395690" +checksum = "317211a0dc0ceedd78fb2ca9a44aed3d7b9b26f81870d485c07122b4350673b7" [[package]] name = "walkdir" @@ -5242,48 +5857,59 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "wasi" +version = "0.13.3+wasi-0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26816d2e1a4a36a2940b96c5296ce403917633dff8f3440e9b236ed6f6bacad2" +dependencies = [ + "wit-bindgen-rt", +] + [[package]] name = "wasm-bindgen" -version = "0.2.92" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" +checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" dependencies = [ "cfg-if", + "once_cell", + "rustversion", "wasm-bindgen-macro", ] [[package]] name = "wasm-bindgen-backend" -version = "0.2.92" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" +checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" dependencies = [ "bumpalo", "log", - "once_cell", "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.98", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.42" +version = "0.4.50" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "76bc14366121efc8dbb487ab05bcc9d346b3b5ec0eaa76e46594cabbe51762c0" +checksum = "555d470ec0bc3bb57890405e5d4322cc9ea83cebb085523ced7be4144dac1e61" dependencies = [ "cfg-if", "js-sys", + "once_cell", "wasm-bindgen", "web-sys", ] [[package]] name = "wasm-bindgen-macro" -version = "0.2.92" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" +checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -5291,28 +5917,31 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.92" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" +checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.98", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.92" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" +checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +dependencies = [ + "unicode-ident", +] [[package]] name = "wasm-streams" -version = "0.4.0" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b65dc4c90b63b118468cf747d8bf3566c1913ef60be765b5730ead9e0a3ba129" +checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" dependencies = [ "futures-util", "js-sys", @@ -5323,9 +5952,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.69" +version = "0.3.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" +checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" dependencies = [ "js-sys", "wasm-bindgen", @@ -5349,9 +5978,9 @@ checksum = "5f20c57d8d7db6d3b86154206ae5d8fba62dd39573114de97c2cb0578251f8e1" [[package]] name = "webpki-roots" -version = "0.26.1" +version = "0.26.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3de34ae270483955a94f4b21bdaaeb83d508bb84a01435f393818edb0012009" +checksum = "2210b291f7ea53617fbafcc4939f10914214ec15aace5ba62293a668f322c5c9" dependencies = [ "rustls-pki-types", ] @@ -5374,11 +6003,11 @@ checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" -version = "0.1.7" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "134306a13c5647ad6453e8deaec55d3a44d6021970129e6188735e74bf546697" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -5436,7 +6065,7 @@ checksum = "9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.98", ] [[package]] @@ -5447,7 +6076,7 @@ checksum = "29bee4b38ea3cde66011baa44dba677c432a78593e202392d1e9070cf2a7fca7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.98", ] [[package]] @@ -5489,15 +6118,6 @@ dependencies = [ "windows-targets 0.52.6", ] -[[package]] 
-name = "windows-sys" -version = "0.45.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" -dependencies = [ - "windows-targets 0.42.2", -] - [[package]] name = "windows-sys" version = "0.48.0" @@ -5517,18 +6137,12 @@ dependencies = [ ] [[package]] -name = "windows-targets" -version = "0.42.2" +name = "windows-sys" +version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ - "windows_aarch64_gnullvm 0.42.2", - "windows_aarch64_msvc 0.42.2", - "windows_i686_gnu 0.42.2", - "windows_i686_msvc 0.42.2", - "windows_x86_64_gnu 0.42.2", - "windows_x86_64_gnullvm 0.42.2", - "windows_x86_64_msvc 0.42.2", + "windows-targets 0.52.6", ] [[package]] @@ -5562,12 +6176,6 @@ dependencies = [ "windows_x86_64_msvc 0.52.6", ] -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" - [[package]] name = "windows_aarch64_gnullvm" version = "0.48.5" @@ -5580,12 +6188,6 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" -[[package]] -name = "windows_aarch64_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" - [[package]] name = "windows_aarch64_msvc" version = "0.48.5" @@ -5598,12 +6200,6 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" -[[package]] -name = "windows_i686_gnu" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" - [[package]] name = "windows_i686_gnu" version = "0.48.5" @@ -5622,12 +6218,6 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" -[[package]] -name = "windows_i686_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" - [[package]] name = "windows_i686_msvc" version = "0.48.5" @@ -5640,12 +6230,6 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" -[[package]] -name = "windows_x86_64_gnu" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" - [[package]] name = "windows_x86_64_gnu" version = "0.48.5" @@ -5658,12 +6242,6 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" - [[package]] name = "windows_x86_64_gnullvm" version = 
"0.48.5" @@ -5676,12 +6254,6 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" -[[package]] -name = "windows_x86_64_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" - [[package]] name = "windows_x86_64_msvc" version = "0.48.5" @@ -5696,9 +6268,9 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" -version = "0.6.20" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36c1fec1a2bb5866f07c25f68c26e565c4c200aebb96d7e55710c19d3e8ac49b" +checksum = "59690dea168f2198d1a3b0cac23b8063efcd11012f10ae4698f284808c8ef603" dependencies = [ "memchr", ] @@ -5713,11 +6285,38 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "wit-bindgen-rt" +version = "0.33.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3268f3d866458b787f390cf61f4bbb563b922d091359f9608842999eaee3943c" +dependencies = [ + "bitflags 2.8.0", +] + +[[package]] +name = "write16" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" + +[[package]] +name = "writeable" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" + +[[package]] +name = "xmlparser" +version = "0.13.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" + [[package]] name = "xxhash-rust" -version = "0.8.10" +version = "0.8.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "927da81e25be1e1a2901d59b81b37dd2efd1fc9c9345a55007f09bf5a2d3ee03" +checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" [[package]] name = "xz2" @@ -5728,46 +6327,134 @@ dependencies = [ "lzma-sys", ] +[[package]] +name = "yoke" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40" +dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.98", + "synstructure", +] + +[[package]] +name = "zerocopy" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "byteorder", + "zerocopy-derive 0.7.35", +] + [[package]] name = "zerocopy" -version = "0.7.32" +version = "0.8.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa91407dacce3a68c56de03abe2760159582b846c6a4acd2f456618087f12713" +dependencies = [ + "zerocopy-derive 0.8.17", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" 
dependencies = [ - "zerocopy-derive", + "proc-macro2", + "quote", + "syn 2.0.98", ] [[package]] name = "zerocopy-derive" -version = "0.7.32" +version = "0.8.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06718a168365cad3d5ff0bb133aad346959a2074bd4a85c121255a11304a8626" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.98", +] + +[[package]] +name = "zerofrom" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cff3ee08c995dee1859d998dea82f7374f2826091dd9cd47def953cae446cd2e" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" +checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.98", + "synstructure", ] [[package]] name = "zeroize" -version = "1.7.0" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" + +[[package]] +name = "zerovec" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d" +checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.98", +] [[package]] name = "zip" -version = "2.2.0" +version = "2.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc5e4288ea4057ae23afc69a4472434a87a2495cafce6632fd1c4ec9f5cf3494" +checksum = "ae9c1ea7b3a5e1f4b922ff856a129881167511563dc219869afe3787fc0c1a45" dependencies = [ "arbitrary", "crc32fast", "crossbeam-utils", "displaydoc", "flate2", - "indexmap 2.5.0", + "indexmap 2.7.1", "memchr", - "thiserror 1.0.64", + "thiserror 2.0.11", "zopfli", ] @@ -5787,27 +6474,27 @@ dependencies = [ [[package]] name = "zstd" -version = "0.13.1" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d789b1514203a1120ad2429eae43a7bd32b90976a7bb8a05f7ec02fa88cc23a" +checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9" dependencies = [ "zstd-safe", ] [[package]] name = "zstd-safe" -version = "7.1.0" +version = "7.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cd99b45c6bc03a018c8b8a86025678c87e55526064e38f9df301989dce7ec0a" +checksum = "54a3ab4db68cea366acc5c897c7b4d4d1b8994a9cd6e6f841f8964566a419059" dependencies = [ "zstd-sys", ] [[package]] name = "zstd-sys" -version = "2.0.10+zstd.1.5.6" +version = "2.0.13+zstd.1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c253a4914af5bafc8fa8c86ee400827e83cf6ec01195ec1f1ed8441bf00d65aa" +checksum = "38ff0f21cfee8f97d94cef41359e0c89aa6113028ab0291aa8ca0038995a95aa" dependencies = [ "cc", "pkg-config", diff --git a/Cargo.toml b/Cargo.toml index f92e51d8e..e3b952695 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,15 +9,31 @@ build = "build.rs" [dependencies] # Arrow and DataFusion ecosystem -arrow-array = { version = "53.0.0" } 
-arrow-flight = { version = "53.0.0", features = ["tls"] } -arrow-ipc = { version = "53.0.0", features = ["zstd"] } -arrow-json = "53.0.0" -arrow-schema = { version = "53.0.0", features = ["serde"] } -arrow-select = "53.0.0" -datafusion = "44.0.0" -object_store = { version = "0.11.2", features = ["cloud", "aws", "azure"] } -parquet = "53.0.0" +arrow = { version = "54.1.0", features = [ + "prettyprint", + "chrono-tz", +] } +arrow-array = { version = "54.1.0" } +arrow-json = "54.1.0" +arrow-select = "54.1.0" +arrow-buffer = { version = "54.1.0", default-features = false } +arrow-flight = { version = "54.1.0", features = [ + "flight-sql-experimental", + "tls" +] } +arrow-ipc = { version = "54.1.0", default-features = false, features = [ + "zstd", +] } +arrow-ord = { version = "54.1.0", default-features = false } +arrow-schema = { version = "54.1.0", features = ["serde"] } +datafusion = {git = "https://github.com/apache/datafusion", branch = "main" } +object_store = { version = "0.11.2", features = ["cloud", "aws", "azure", "gcp", "http"] } +parking_lot = "0.12" +parquet = { version = "54.1.0", default-features = false, features = [ + "arrow", + "async", + "object_store", +] } # Web server and HTTP-related actix-cors = "0.7.0" @@ -117,6 +133,9 @@ thiserror = "2.0.0" ulid = { version = "1.0", features = ["serde"] } xxhash-rust = { version = "0.8", features = ["xxh3"] } structopt-derive = "0.4.18" +dirs = "6.0.0" +aws-config = "1.5.16" +aws-credential-types = "1.2.1" [build-dependencies] cargo_toml = "0.20.1" @@ -129,7 +148,7 @@ zip = { version = "2.2.0", default-features = false, features = ["deflate"] } [dev-dependencies] rstest = "0.23.0" -arrow = "53.0.0" + [package.metadata.parseable_ui] assets-url = "https://github.com/parseablehq/console/releases/download/v0.9.18/build.zip" diff --git a/src/handlers/airplane.rs b/src/handlers/airplane.rs index 5edfdba21..abde7b8bd 100644 --- a/src/handlers/airplane.rs +++ b/src/handlers/airplane.rs @@ -1,348 +1,348 @@ -/* - * Parseable Server (C) 2022 - 2024 Parseable, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . 
- * - */ - -use arrow_array::RecordBatch; -use arrow_flight::flight_service_server::FlightServiceServer; -use arrow_flight::PollInfo; -use arrow_schema::ArrowError; - -use datafusion::common::tree_node::TreeNode; -use serde_json::json; -use std::net::SocketAddr; -use std::time::Instant; -use tonic::codec::CompressionEncoding; -use tracing::{error, info}; - -use futures_util::{Future, TryFutureExt}; - -use tonic::transport::{Identity, Server, ServerTlsConfig}; -use tonic_web::GrpcWebLayer; - -use crate::handlers::http::cluster::get_ingestor_info; -use crate::handlers::http::query::{into_query, update_schema_when_distributed}; -use crate::handlers::livetail::cross_origin_config; -use crate::metrics::QUERY_EXECUTE_TIME; -use crate::option::CONFIG; -use crate::query::{TableScanVisitor, QUERY_SESSION}; -use crate::utils::arrow::flight::{ - append_temporary_events, get_query_from_ticket, into_flight_data, run_do_get_rpc, - send_to_ingester, -}; -use crate::utils::time::TimeRange; -use crate::utils::user_auth_for_query; -use arrow_flight::{ - flight_service_server::FlightService, Action, ActionType, Criteria, Empty, FlightData, - FlightDescriptor, FlightInfo, HandshakeRequest, HandshakeResponse, PutResult, SchemaAsIpc, - SchemaResult, Ticket, -}; -use arrow_ipc::writer::IpcWriteOptions; -use futures::stream; -use tonic::{Request, Response, Status, Streaming}; - -use crate::handlers::livetail::extract_session_key; -use crate::metadata::STREAM_INFO; -use crate::rbac; -use crate::rbac::Users; - -#[derive(Clone, Debug)] -pub struct AirServiceImpl {} - -#[tonic::async_trait] -impl FlightService for AirServiceImpl { - type HandshakeStream = stream::BoxStream<'static, Result>; - type ListFlightsStream = stream::BoxStream<'static, Result>; - type DoGetStream = stream::BoxStream<'static, Result>; - type DoPutStream = stream::BoxStream<'static, Result>; - type DoActionStream = stream::BoxStream<'static, Result>; - type ListActionsStream = stream::BoxStream<'static, Result>; - type DoExchangeStream = stream::BoxStream<'static, Result>; - - async fn handshake( - &self, - _request: Request>, - ) -> Result, Status> { - Err(Status::unimplemented( - "handshake is disabled in favour of direct authentication and authorization", - )) - } - - /// list_flights is an operation that allows a client - /// to query a Flight server for information - /// about available datasets or "flights" that the server can provide. 
- async fn list_flights( - &self, - _request: Request, - ) -> Result, Status> { - Err(Status::unimplemented("Implement list_flights")) - } - - async fn poll_flight_info( - &self, - _request: Request, - ) -> Result, Status> { - Err(Status::unimplemented("Implement poll_flight_info")) - } - - async fn get_flight_info( - &self, - _request: Request, - ) -> Result, Status> { - Err(Status::unimplemented("Implement get_flight_info")) - } - - async fn get_schema( - &self, - request: Request, - ) -> Result, Status> { - let table_name = request.into_inner().path; - let table_name = table_name[0].clone(); - - let schema = STREAM_INFO - .schema(&table_name) - .map_err(|err| Status::failed_precondition(err.to_string()))?; - - let options = IpcWriteOptions::default(); - let schema_result = SchemaAsIpc::new(&schema, &options) - .try_into() - .map_err(|err: ArrowError| Status::internal(err.to_string()))?; - - Ok(Response::new(schema_result)) - } - - async fn do_get(&self, req: Request) -> Result, Status> { - let key = extract_session_key(req.metadata())?; - - let ticket = get_query_from_ticket(&req)?; - - info!("query requested to airplane: {:?}", ticket); - - // get the query session_state - let session_state = QUERY_SESSION.state(); - - // get the logical plan and extract the table name - let raw_logical_plan = session_state - .create_logical_plan(&ticket.query) - .await - .map_err(|err| { - error!("Datafusion Error: Failed to create logical plan: {}", err); - Status::internal("Failed to create logical plan") - })?; - - let time_range = TimeRange::parse_human_time(&ticket.start_time, &ticket.end_time) - .map_err(|e| Status::internal(e.to_string()))?; - // create a visitor to extract the table name - let mut visitor = TableScanVisitor::default(); - let _ = raw_logical_plan.visit(&mut visitor); - - let streams = visitor.into_inner(); - - let stream_name = streams - .first() - .ok_or_else(|| Status::aborted("Malformed SQL Provided, Table Name Not Found"))? 
- .to_owned(); - - update_schema_when_distributed(&streams) - .await - .map_err(|err| Status::internal(err.to_string()))?; - - // map payload to query - let query = into_query(&ticket, &session_state, time_range) - .await - .map_err(|_| Status::internal("Failed to parse query"))?; - - let event = if send_to_ingester( - query.time_range.start.timestamp_millis(), - query.time_range.end.timestamp_millis(), - ) { - let sql = format!("select * from {}", &stream_name); - let start_time = ticket.start_time.clone(); - let end_time = ticket.end_time.clone(); - let out_ticket = json!({ - "query": sql, - "startTime": start_time, - "endTime": end_time - }) - .to_string(); - - let ingester_metadatas = get_ingestor_info() - .await - .map_err(|err| Status::failed_precondition(err.to_string()))?; - let mut minute_result: Vec = vec![]; - - for im in ingester_metadatas { - if let Ok(mut batches) = run_do_get_rpc(im, out_ticket.clone()).await { - minute_result.append(&mut batches); - } - } - let mr = minute_result.iter().collect::>(); - let event = append_temporary_events(&stream_name, mr).await?; - Some(event) - } else { - None - }; - - // try authorize - match Users.authorize(key.clone(), rbac::role::Action::Query, None, None) { - rbac::Response::Authorized => (), - rbac::Response::UnAuthorized => { - return Err(Status::permission_denied( - "user is not authorized to access this resource", - )) - } - rbac::Response::ReloadRequired => { - return Err(Status::unauthenticated("reload required")) - } - } - - let permissions = Users.get_permissions(&key); - - user_auth_for_query(&permissions, &streams).map_err(|_| { - Status::permission_denied("User Does not have permission to access this") - })?; - let time = Instant::now(); - let (records, _) = query - .execute(stream_name.clone()) - .await - .map_err(|err| Status::internal(err.to_string()))?; - - /* - * INFO: No returning the schema with the data. - * kept it in case it needs to be sent in the future. - - let schemas = results - .iter() - .map(|batch| batch.schema()) - .map(|s| s.as_ref().clone()) - .collect::>(); - let schema = Schema::try_merge(schemas).map_err(|err| Status::internal(err.to_string()))?; - */ - let out = into_flight_data(records); - - if let Some(event) = event { - event.clear(&stream_name); - } - - let time = time.elapsed().as_secs_f64(); - QUERY_EXECUTE_TIME - .with_label_values(&[&format!("flight-query-{}", stream_name)]) - .observe(time); - - out - } - - async fn do_put( - &self, - _request: Request>, - ) -> Result, Status> { - Err(Status::unimplemented( - "do_put not implemented because we are only using flight for querying", - )) - } - - async fn do_action( - &self, - _request: Request, - ) -> Result, Status> { - Err(Status::unimplemented( - "do_action not implemented because we are only using flight for querying", - )) - } - - async fn list_actions( - &self, - _request: Request, - ) -> Result, Status> { - Err(Status::unimplemented( - "list_actions not implemented because we are only using flight for querying", - )) - } - - async fn do_exchange( - &self, - _request: Request>, - ) -> Result, Status> { - Err(Status::unimplemented( - "do_exchange not implemented because we are only using flight for querying", - )) - } -} - -pub fn server() -> impl Future>> + Send { - let mut addr: SocketAddr = CONFIG - .options - .address - .parse() - .unwrap_or_else(|err| panic!("{}, failed to parse `{}` as a socket address. Please set the environment variable `P_ADDR` to `:` without the scheme (e.g., 192.168.1.1:8000). 
Please refer to the documentation: https://logg.ing/env for more details.", -CONFIG.options.address, err)); - addr.set_port(CONFIG.options.flight_port); - - let service = AirServiceImpl {}; - - let svc = FlightServiceServer::new(service) - .max_encoding_message_size(usize::MAX) - .max_decoding_message_size(usize::MAX) - .send_compressed(CompressionEncoding::Zstd) - .accept_compressed(CompressionEncoding::Zstd); - - let cors = cross_origin_config(); - - let identity = match (&CONFIG.options.tls_cert_path, &CONFIG.options.tls_key_path) { - (Some(cert), Some(key)) => { - match (std::fs::read_to_string(cert), std::fs::read_to_string(key)) { - (Ok(cert_file), Ok(key_file)) => { - let identity = Identity::from_pem(cert_file, key_file); - Some(identity) - } - _ => None, - } - } - (_, _) => None, - }; - - let config = identity.map(|id| ServerTlsConfig::new().identity(id)); - - // rust is treating closures as different types - let err_map_fn = |err| Box::new(err) as Box; - - // match on config to decide if we want to use tls or not - match config { - Some(config) => { - let server = match Server::builder().tls_config(config) { - Ok(server) => server, - Err(_) => Server::builder(), - }; - - server - .max_frame_size(16 * 1024 * 1024 - 2) - .accept_http1(true) - .layer(cors) - .layer(GrpcWebLayer::new()) - .add_service(svc) - .serve(addr) - .map_err(err_map_fn) - } - None => Server::builder() - .max_frame_size(16 * 1024 * 1024 - 2) - .accept_http1(true) - .layer(cors) - .layer(GrpcWebLayer::new()) - .add_service(svc) - .serve(addr) - .map_err(err_map_fn), - } -} +// /* +// * Parseable Server (C) 2022 - 2024 Parseable, Inc. +// * +// * This program is free software: you can redistribute it and/or modify +// * it under the terms of the GNU Affero General Public License as +// * published by the Free Software Foundation, either version 3 of the +// * License, or (at your option) any later version. +// * +// * This program is distributed in the hope that it will be useful, +// * but WITHOUT ANY WARRANTY; without even the implied warranty of +// * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// * GNU Affero General Public License for more details. +// * +// * You should have received a copy of the GNU Affero General Public License +// * along with this program. If not, see . 
+// * +// */ + +// use arrow_array::RecordBatch; +// use arrow_flight::flight_service_server::FlightServiceServer; +// use arrow_flight::PollInfo; +// use arrow_schema::ArrowError; + +// use datafusion::common::tree_node::TreeNode; +// use serde_json::json; +// use std::net::SocketAddr; +// use std::time::Instant; +// use tonic::codec::CompressionEncoding; +// use tracing::{error, info}; + +// use futures_util::{Future, TryFutureExt}; + +// use tonic::transport::{Identity, Server, ServerTlsConfig}; +// use tonic_web::GrpcWebLayer; + +// use crate::handlers::http::cluster::get_ingestor_info; +// use crate::handlers::http::query::{into_query, update_schema_when_distributed}; +// use crate::handlers::livetail::cross_origin_config; +// use crate::metrics::QUERY_EXECUTE_TIME; +// use crate::option::CONFIG; +// use crate::query::{TableScanVisitor, QUERY_SESSION}; +// use crate::utils::arrow::flight::{ +// append_temporary_events, get_query_from_ticket, into_flight_data, run_do_get_rpc, +// send_to_ingester, +// }; +// use crate::utils::time::TimeRange; +// use crate::utils::user_auth_for_query; +// use arrow_flight::{ +// flight_service_server::FlightService, Action, ActionType, Criteria, Empty, FlightData, +// FlightDescriptor, FlightInfo, HandshakeRequest, HandshakeResponse, PutResult, SchemaAsIpc, +// SchemaResult, Ticket, +// }; +// use arrow_ipc::writer::IpcWriteOptions; +// use futures::stream; +// use tonic::{Request, Response, Status, Streaming}; + +// use crate::handlers::livetail::extract_session_key; +// use crate::metadata::STREAM_INFO; +// use crate::rbac; +// use crate::rbac::Users; + +// #[derive(Clone, Debug)] +// pub struct AirServiceImpl {} + +// #[tonic::async_trait] +// impl FlightService for AirServiceImpl { +// type HandshakeStream = stream::BoxStream<'static, Result>; +// type ListFlightsStream = stream::BoxStream<'static, Result>; +// type DoGetStream = stream::BoxStream<'static, Result>; +// type DoPutStream = stream::BoxStream<'static, Result>; +// type DoActionStream = stream::BoxStream<'static, Result>; +// type ListActionsStream = stream::BoxStream<'static, Result>; +// type DoExchangeStream = stream::BoxStream<'static, Result>; + +// async fn handshake( +// &self, +// _request: Request>, +// ) -> Result, Status> { +// Err(Status::unimplemented( +// "handshake is disabled in favour of direct authentication and authorization", +// )) +// } + +// /// list_flights is an operation that allows a client +// /// to query a Flight server for information +// /// about available datasets or "flights" that the server can provide. 
+// async fn list_flights( +// &self, +// _request: Request, +// ) -> Result, Status> { +// Err(Status::unimplemented("Implement list_flights")) +// } + +// async fn poll_flight_info( +// &self, +// _request: Request, +// ) -> Result, Status> { +// Err(Status::unimplemented("Implement poll_flight_info")) +// } + +// async fn get_flight_info( +// &self, +// _request: Request, +// ) -> Result, Status> { +// Err(Status::unimplemented("Implement get_flight_info")) +// } + +// async fn get_schema( +// &self, +// request: Request, +// ) -> Result, Status> { +// let table_name = request.into_inner().path; +// let table_name = table_name[0].clone(); + +// let schema = STREAM_INFO +// .schema(&table_name) +// .map_err(|err| Status::failed_precondition(err.to_string()))?; + +// let options = IpcWriteOptions::default(); +// let schema_result = SchemaAsIpc::new(&schema, &options) +// .try_into() +// .map_err(|err: ArrowError| Status::internal(err.to_string()))?; + +// Ok(Response::new(schema_result)) +// } + +// async fn do_get(&self, req: Request) -> Result, Status> { +// let key = extract_session_key(req.metadata())?; + +// let ticket = get_query_from_ticket(&req)?; + +// info!("query requested to airplane: {:?}", ticket); + +// // get the query session_state +// let session_state = QUERY_SESSION.state(); + +// // get the logical plan and extract the table name +// let raw_logical_plan = session_state +// .create_logical_plan(&ticket.query) +// .await +// .map_err(|err| { +// error!("Datafusion Error: Failed to create logical plan: {}", err); +// Status::internal("Failed to create logical plan") +// })?; + +// let time_range = TimeRange::parse_human_time(&ticket.start_time, &ticket.end_time) +// .map_err(|e| Status::internal(e.to_string()))?; +// // create a visitor to extract the table name +// let mut visitor = TableScanVisitor::default(); +// let _ = raw_logical_plan.visit(&mut visitor); + +// let streams = visitor.into_inner(); + +// let stream_name = streams +// .first() +// .ok_or_else(|| Status::aborted("Malformed SQL Provided, Table Name Not Found"))? 
+// .to_owned(); + +// update_schema_when_distributed(&streams) +// .await +// .map_err(|err| Status::internal(err.to_string()))?; + +// // map payload to query +// let query = into_query(&ticket, &session_state, time_range) +// .await +// .map_err(|_| Status::internal("Failed to parse query"))?; + +// let event = if send_to_ingester( +// query.time_range.start.timestamp_millis(), +// query.time_range.end.timestamp_millis(), +// ) { +// let sql = format!("select * from {}", &stream_name); +// let start_time = ticket.start_time.clone(); +// let end_time = ticket.end_time.clone(); +// let out_ticket = json!({ +// "query": sql, +// "startTime": start_time, +// "endTime": end_time +// }) +// .to_string(); + +// let ingester_metadatas = get_ingestor_info() +// .await +// .map_err(|err| Status::failed_precondition(err.to_string()))?; +// let mut minute_result: Vec = vec![]; + +// for im in ingester_metadatas { +// if let Ok(mut batches) = run_do_get_rpc(im, out_ticket.clone()).await { +// minute_result.append(&mut batches); +// } +// } +// let mr = minute_result.iter().collect::>(); +// let event = append_temporary_events(&stream_name, mr).await?; +// Some(event) +// } else { +// None +// }; + +// // try authorize +// match Users.authorize(key.clone(), rbac::role::Action::Query, None, None) { +// rbac::Response::Authorized => (), +// rbac::Response::UnAuthorized => { +// return Err(Status::permission_denied( +// "user is not authorized to access this resource", +// )) +// } +// rbac::Response::ReloadRequired => { +// return Err(Status::unauthenticated("reload required")) +// } +// } + +// let permissions = Users.get_permissions(&key); + +// user_auth_for_query(&permissions, &streams).map_err(|_| { +// Status::permission_denied("User Does not have permission to access this") +// })?; +// let time = Instant::now(); +// let (records, _) = query +// .execute(stream_name.clone()) +// .await +// .map_err(|err| Status::internal(err.to_string()))?; + +// /* +// * INFO: No returning the schema with the data. +// * kept it in case it needs to be sent in the future. 
+ +// let schemas = results +// .iter() +// .map(|batch| batch.schema()) +// .map(|s| s.as_ref().clone()) +// .collect::>(); +// let schema = Schema::try_merge(schemas).map_err(|err| Status::internal(err.to_string()))?; +// */ +// let out = into_flight_data(records); + +// if let Some(event) = event { +// event.clear(&stream_name); +// } + +// let time = time.elapsed().as_secs_f64(); +// QUERY_EXECUTE_TIME +// .with_label_values(&[&format!("flight-query-{}", stream_name)]) +// .observe(time); + +// out +// } + +// async fn do_put( +// &self, +// _request: Request>, +// ) -> Result, Status> { +// Err(Status::unimplemented( +// "do_put not implemented because we are only using flight for querying", +// )) +// } + +// async fn do_action( +// &self, +// _request: Request, +// ) -> Result, Status> { +// Err(Status::unimplemented( +// "do_action not implemented because we are only using flight for querying", +// )) +// } + +// async fn list_actions( +// &self, +// _request: Request, +// ) -> Result, Status> { +// Err(Status::unimplemented( +// "list_actions not implemented because we are only using flight for querying", +// )) +// } + +// async fn do_exchange( +// &self, +// _request: Request>, +// ) -> Result, Status> { +// Err(Status::unimplemented( +// "do_exchange not implemented because we are only using flight for querying", +// )) +// } +// } + +// pub fn server() -> impl Future>> + Send { +// let mut addr: SocketAddr = CONFIG +// .options +// .address +// .parse() +// .unwrap_or_else(|err| panic!("{}, failed to parse `{}` as a socket address. Please set the environment variable `P_ADDR` to `:` without the scheme (e.g., 192.168.1.1:8000). Please refer to the documentation: https://logg.ing/env for more details.", +// CONFIG.options.address, err)); +// addr.set_port(CONFIG.options.flight_port); + +// let service = AirServiceImpl {}; + +// let svc = FlightServiceServer::new(service) +// .max_encoding_message_size(usize::MAX) +// .max_decoding_message_size(usize::MAX) +// .send_compressed(CompressionEncoding::Zstd) +// .accept_compressed(CompressionEncoding::Zstd); + +// let cors = cross_origin_config(); + +// let identity = match (&CONFIG.options.tls_cert_path, &CONFIG.options.tls_key_path) { +// (Some(cert), Some(key)) => { +// match (std::fs::read_to_string(cert), std::fs::read_to_string(key)) { +// (Ok(cert_file), Ok(key_file)) => { +// let identity = Identity::from_pem(cert_file, key_file); +// Some(identity) +// } +// _ => None, +// } +// } +// (_, _) => None, +// }; + +// let config = identity.map(|id| ServerTlsConfig::new().identity(id)); + +// // rust is treating closures as different types +// let err_map_fn = |err| Box::new(err) as Box; + +// // match on config to decide if we want to use tls or not +// match config { +// Some(config) => { +// let server = match Server::builder().tls_config(config) { +// Ok(server) => server, +// Err(_) => Server::builder(), +// }; + +// server +// .max_frame_size(16 * 1024 * 1024 - 2) +// .accept_http1(true) +// .layer(cors) +// .layer(GrpcWebLayer::new()) +// .add_service(svc) +// .serve(addr) +// .map_err(err_map_fn) +// } +// None => Server::builder() +// .max_frame_size(16 * 1024 * 1024 - 2) +// .accept_http1(true) +// .layer(cors) +// .layer(GrpcWebLayer::new()) +// .add_service(svc) +// .serve(addr) +// .map_err(err_map_fn), +// } +// } diff --git a/src/handlers/http/modal/ingest_server.rs b/src/handlers/http/modal/ingest_server.rs index 215f79478..c2e22c398 100644 --- a/src/handlers/http/modal/ingest_server.rs +++ 
b/src/handlers/http/modal/ingest_server.rs @@ -24,7 +24,6 @@ use super::IngestorMetadata; use super::OpenIdClient; use super::ParseableServer; use crate::analytics; -use crate::handlers::airplane; use crate::handlers::http::ingest; use crate::handlers::http::logstream; use crate::handlers::http::middleware::DisAllowRootUser; @@ -109,7 +108,7 @@ impl ParseableServer for IngestServer { let (mut remote_sync_handler, mut remote_sync_outbox, mut remote_sync_inbox) = sync::object_store_sync().await; - tokio::spawn(airplane::server()); + // tokio::spawn(airplane::server()); // set the ingestor metadata set_ingestor_metadata().await?; diff --git a/src/handlers/http/modal/query_server.rs b/src/handlers/http/modal/query_server.rs index 4c77e226c..93d368ccf 100644 --- a/src/handlers/http/modal/query_server.rs +++ b/src/handlers/http/modal/query_server.rs @@ -18,7 +18,6 @@ use crate::alerts::ALERTS; use crate::correlation::CORRELATIONS; -use crate::handlers::airplane; use crate::handlers::http::base_path; use crate::handlers::http::cluster::{self, init_cluster_metrics_schedular}; use crate::handlers::http::logstream::create_internal_stream_if_not_exists; @@ -134,7 +133,7 @@ impl ParseableServer for QueryServer { let (mut remote_sync_handler, mut remote_sync_outbox, mut remote_sync_inbox) = sync::object_store_sync().await; - tokio::spawn(airplane::server()); + // tokio::spawn(airplane::server()); let app = self.start(shutdown_rx, prometheus, CONFIG.options.openid()); tokio::pin!(app); diff --git a/src/handlers/http/modal/server.rs b/src/handlers/http/modal/server.rs index c7ee3963f..2904b9b95 100644 --- a/src/handlers/http/modal/server.rs +++ b/src/handlers/http/modal/server.rs @@ -136,7 +136,7 @@ impl ParseableServer for Server { } tokio::spawn(handlers::livetail::server()); - tokio::spawn(handlers::airplane::server()); + // tokio::spawn(handlers::airplane::server()); let app = self.start(shutdown_rx, prometheus, CONFIG.options.openid()); diff --git a/src/query/catalog.rs b/src/query/catalog.rs new file mode 100644 index 000000000..bbe2b2ce7 --- /dev/null +++ b/src/query/catalog.rs @@ -0,0 +1,212 @@ +use std::any::Any; +use std::sync::{Arc, Weak}; + +use datafusion::catalog::{CatalogProvider, CatalogProviderList, SchemaProvider}; + +use datafusion::common::plan_datafusion_err; +use datafusion::datasource::listing::ListingTableUrl; +use datafusion::datasource::TableProvider; +use datafusion::error::Result; +use datafusion::execution::context::SessionState; +use datafusion::execution::session_state::SessionStateBuilder; + +use async_trait::async_trait; +use dirs::home_dir; +use parking_lot::RwLock; + +use super::object_storage::{get_object_store, AwsOptions, GcpOptions}; + +/// Wraps another catalog, automatically register require object stores for the file locations +#[derive(Debug)] +pub struct DynamicObjectStoreCatalog { + inner: Arc, + state: Weak>, +} + +impl DynamicObjectStoreCatalog { + pub fn new( + inner: Arc, + state: Weak>, + ) -> Self { + Self { inner, state } + } +} + +impl CatalogProviderList for DynamicObjectStoreCatalog { + fn as_any(&self) -> &dyn Any { + self + } + + fn register_catalog( + &self, + name: String, + catalog: Arc, + ) -> Option> { + self.inner.register_catalog(name, catalog) + } + + fn catalog_names(&self) -> Vec { + self.inner.catalog_names() + } + + fn catalog(&self, name: &str) -> Option> { + let state = self.state.clone(); + self.inner.catalog(name).map(|catalog| { + Arc::new(DynamicObjectStoreCatalogProvider::new(catalog, state)) as _ + }) + } +} + +/// Wraps another 
catalog provider +#[derive(Debug)] +struct DynamicObjectStoreCatalogProvider { + inner: Arc, + state: Weak>, +} + +impl DynamicObjectStoreCatalogProvider { + pub fn new( + inner: Arc, + state: Weak>, + ) -> Self { + Self { inner, state } + } +} + +impl CatalogProvider for DynamicObjectStoreCatalogProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema_names(&self) -> Vec { + self.inner.schema_names() + } + + fn schema(&self, name: &str) -> Option> { + let state = self.state.clone(); + self.inner.schema(name).map(|schema| { + Arc::new(DynamicObjectStoreSchemaProvider::new(schema, state)) as _ + }) + } + + fn register_schema( + &self, + name: &str, + schema: Arc, + ) -> Result>> { + self.inner.register_schema(name, schema) + } +} + +/// Wraps another schema provider. [DynamicObjectStoreSchemaProvider] is responsible for registering the required +/// object stores for the file locations. +#[derive(Debug)] +struct DynamicObjectStoreSchemaProvider { + inner: Arc, + state: Weak>, +} + +impl DynamicObjectStoreSchemaProvider { + pub fn new( + inner: Arc, + state: Weak>, + ) -> Self { + Self { inner, state } + } +} + +#[async_trait] +impl SchemaProvider for DynamicObjectStoreSchemaProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn table_names(&self) -> Vec { + self.inner.table_names() + } + + fn register_table( + &self, + name: String, + table: Arc, + ) -> Result>> { + self.inner.register_table(name, table) + } + + async fn table(&self, name: &str) -> Result>> { + let inner_table = self.inner.table(name).await; + if inner_table.is_ok() { + if let Some(inner_table) = inner_table? { + return Ok(Some(inner_table)); + } + } + + // if the inner schema provider didn't have a table by + // that name, try to treat it as a listing table + let mut state = self + .state + .upgrade() + .ok_or_else(|| plan_datafusion_err!("locking error"))? + .read() + .clone(); + let mut builder = SessionStateBuilder::from(state.clone()); + let optimized_name = substitute_tilde(name.to_owned()); + let table_url = ListingTableUrl::parse(optimized_name.as_str())?; + let scheme = table_url.scheme(); + let url = table_url.as_ref(); + + // If the store is already registered for this URL then `get_store` + // will return `Ok` which means we don't need to register it again. However, + // if `get_store` returns an `Err` then it means the corresponding store is + // not registered yet and we need to register it + match state.runtime_env().object_store_registry.get_store(url) { + Ok(_) => { /*Nothing to do here, store for this URL is already registered*/ } + Err(_) => { + // Register the store for this URL. 
Here we don't have access + // to any command options so the only choice is to use an empty collection + match scheme { + "s3" | "oss" | "cos" => { + if let Some(table_options) = builder.table_options() { + table_options.extensions.insert(AwsOptions::default()) + } + } + "gs" | "gcs" => { + if let Some(table_options) = builder.table_options() { + table_options.extensions.insert(GcpOptions::default()) + } + } + _ => {} + }; + state = builder.build(); + let store = get_object_store( + &state, + table_url.scheme(), + url, + &state.default_table_options(), + ) + .await?; + state.runtime_env().register_object_store(url, store); + } + } + self.inner.table(name).await + } + + fn deregister_table(&self, name: &str) -> Result>> { + self.inner.deregister_table(name) + } + + fn table_exist(&self, name: &str) -> bool { + self.inner.table_exist(name) + } +} + +pub fn substitute_tilde(cur: String) -> String { + if let Some(usr_dir_path) = home_dir() { + if let Some(usr_dir) = usr_dir_path.to_str() { + if cur.starts_with('~') && !usr_dir.is_empty() { + return cur.replacen('~', usr_dir, 1); + } + } + } + cur +} \ No newline at end of file diff --git a/src/query/functions.rs b/src/query/functions.rs new file mode 100644 index 000000000..9c84987a5 --- /dev/null +++ b/src/query/functions.rs @@ -0,0 +1,461 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Functions that are query-able and searchable via the `\h` command + +use std::fmt; +use std::fs::File; +use std::str::FromStr; +use std::sync::Arc; + +use arrow::array::{Int64Array, StringArray}; +use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use arrow::record_batch::RecordBatch; +use datafusion::catalog::{Session, TableFunctionImpl}; +use datafusion::common::{plan_err, Column}; +use datafusion::datasource::TableProvider; +use datafusion::error::Result; +use datafusion::logical_expr::Expr; +use datafusion::physical_plan::memory::MemorySourceConfig; +use datafusion::physical_plan::ExecutionPlan; +use datafusion::scalar::ScalarValue; + +use async_trait::async_trait; +use parquet::basic::ConvertedType; +use parquet::data_type::{ByteArray, FixedLenByteArray}; +use parquet::file::reader::FileReader; +use parquet::file::serialized_reader::SerializedFileReader; +use parquet::file::statistics::Statistics; + +#[derive(Debug)] +pub enum Function { + Select, + Explain, + Show, + CreateTable, + CreateTableAs, + Insert, + DropTable, +} + +// const ALL_FUNCTIONS: [Function; 7] = [ +// Function::CreateTable, +// Function::CreateTableAs, +// Function::DropTable, +// Function::Explain, +// Function::Insert, +// Function::Select, +// Function::Show, +// ]; + +impl Function { +// pub fn function_details(&self) -> Result<&str> { +// let details = match self { +// Function::Select => { +// r#" +// Command: SELECT +// Description: retrieve rows from a table or view +// Syntax: +// SELECT [ ALL | DISTINCT [ ON ( expression [, ...] ) ] ] +// [ * | expression [ [ AS ] output_name ] [, ...] ] +// [ FROM from_item [, ...] ] +// [ WHERE condition ] +// [ GROUP BY [ ALL | DISTINCT ] grouping_element [, ...] ] +// [ HAVING condition ] +// [ WINDOW window_name AS ( window_definition ) [, ...] ] +// [ { UNION | INTERSECT | EXCEPT } [ ALL | DISTINCT ] select ] +// [ ORDER BY expression [ ASC | DESC | USING operator ] [ NULLS { FIRST | LAST } ] [, ...] ] +// [ LIMIT { count | ALL } ] +// [ OFFSET start [ ROW | ROWS ] ] + +// where from_item can be one of: + +// [ ONLY ] table_name [ * ] [ [ AS ] alias [ ( column_alias [, ...] ) ] ] +// [ TABLESAMPLE sampling_method ( argument [, ...] ) [ REPEATABLE ( seed ) ] ] +// [ LATERAL ] ( select ) [ AS ] alias [ ( column_alias [, ...] ) ] +// with_query_name [ [ AS ] alias [ ( column_alias [, ...] ) ] ] +// [ LATERAL ] function_name ( [ argument [, ...] ] ) +// [ WITH ORDINALITY ] [ [ AS ] alias [ ( column_alias [, ...] ) ] ] +// [ LATERAL ] function_name ( [ argument [, ...] ] ) [ AS ] alias ( column_definition [, ...] ) +// [ LATERAL ] function_name ( [ argument [, ...] ] ) AS ( column_definition [, ...] ) +// [ LATERAL ] ROWS FROM( function_name ( [ argument [, ...] ] ) [ AS ( column_definition [, ...] ) ] [, ...] ) +// [ WITH ORDINALITY ] [ [ AS ] alias [ ( column_alias [, ...] ) ] ] +// from_item [ NATURAL ] join_type from_item [ ON join_condition | USING ( join_column [, ...] ) [ AS join_using_alias ] ] + +// and grouping_element can be one of: + +// ( ) +// expression +// ( expression [, ...] ) + +// and with_query is: + +// with_query_name [ ( column_name [, ...] 
) ] AS [ [ NOT ] MATERIALIZED ] ( select | values | insert | update | delete ) + +// TABLE [ ONLY ] table_name [ * ]"# +// } +// Function::Explain => { +// r#" +// Command: EXPLAIN +// Description: show the execution plan of a statement +// Syntax: +// EXPLAIN [ ANALYZE ] statement +// "# +// } +// Function::Show => { +// r#" +// Command: SHOW +// Description: show the value of a run-time parameter +// Syntax: +// SHOW name +// "# +// } +// Function::CreateTable => { +// r#" +// Command: CREATE TABLE +// Description: define a new table +// Syntax: +// CREATE [ EXTERNAL ] TABLE table_name ( [ +// { column_name data_type } +// [, ... ] +// ] ) +// "# +// } +// Function::CreateTableAs => { +// r#" +// Command: CREATE TABLE AS +// Description: define a new table from the results of a query +// Syntax: +// CREATE TABLE table_name +// [ (column_name [, ...] ) ] +// AS query +// [ WITH [ NO ] DATA ] +// "# +// } +// Function::Insert => { +// r#" +// Command: INSERT +// Description: create new rows in a table +// Syntax: +// INSERT INTO table_name [ ( column_name [, ...] ) ] +// { VALUES ( { expression } [, ...] ) [, ...] } +// "# +// } +// Function::DropTable => { +// r#" +// Command: DROP TABLE +// Description: remove a table +// Syntax: +// DROP TABLE [ IF EXISTS ] name [, ...] +// "# +// } +// }; +// Ok(details) +// } +} + +impl FromStr for Function { + type Err = (); + + fn from_str(s: &str) -> Result { + Ok(match s.trim().to_uppercase().as_str() { + "SELECT" => Self::Select, + "EXPLAIN" => Self::Explain, + "SHOW" => Self::Show, + "CREATE TABLE" => Self::CreateTable, + "CREATE TABLE AS" => Self::CreateTableAs, + "INSERT" => Self::Insert, + "DROP TABLE" => Self::DropTable, + _ => return Err(()), + }) + } +} + +impl fmt::Display for Function { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + Function::Select => write!(f, "SELECT"), + Function::Explain => write!(f, "EXPLAIN"), + Function::Show => write!(f, "SHOW"), + Function::CreateTable => write!(f, "CREATE TABLE"), + Function::CreateTableAs => write!(f, "CREATE TABLE AS"), + Function::Insert => write!(f, "INSERT"), + Function::DropTable => write!(f, "DROP TABLE"), + } + } +} + +// pub fn display_all_functions() -> Result<()> { +// println!("Available help:"); +// let array = StringArray::from( +// ALL_FUNCTIONS +// .iter() +// .map(|f| format!("{}", f)) +// .collect::>(), +// ); +// let schema = Schema::new(vec![Field::new("Function", DataType::Utf8, false)]); +// let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(array)])?; +// println!("{}", pretty_format_batches(&[batch]).unwrap()); +// Ok(()) +// } + +/// PARQUET_META table function +#[derive(Debug)] +struct ParquetMetadataTable { + schema: SchemaRef, + batch: RecordBatch, +} + +#[async_trait] +impl TableProvider for ParquetMetadataTable { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn schema(&self) -> arrow::datatypes::SchemaRef { + self.schema.clone() + } + + fn table_type(&self) -> datafusion::logical_expr::TableType { + datafusion::logical_expr::TableType::Base + } + + async fn scan( + &self, + _state: &dyn Session, + projection: Option<&Vec>, + _filters: &[Expr], + _limit: Option, + ) -> Result> { + Ok(MemorySourceConfig::try_new_exec( + &[vec![self.batch.clone()]], + TableProvider::schema(self), + projection.cloned(), + )?) 
+ } +} + +fn convert_parquet_statistics( + value: &Statistics, + converted_type: ConvertedType, +) -> (Option, Option) { + match (value, converted_type) { + (Statistics::Boolean(val), _) => ( + val.min_opt().map(|v| v.to_string()), + val.max_opt().map(|v| v.to_string()), + ), + (Statistics::Int32(val), _) => ( + val.min_opt().map(|v| v.to_string()), + val.max_opt().map(|v| v.to_string()), + ), + (Statistics::Int64(val), _) => ( + val.min_opt().map(|v| v.to_string()), + val.max_opt().map(|v| v.to_string()), + ), + (Statistics::Int96(val), _) => ( + val.min_opt().map(|v| v.to_string()), + val.max_opt().map(|v| v.to_string()), + ), + (Statistics::Float(val), _) => ( + val.min_opt().map(|v| v.to_string()), + val.max_opt().map(|v| v.to_string()), + ), + (Statistics::Double(val), _) => ( + val.min_opt().map(|v| v.to_string()), + val.max_opt().map(|v| v.to_string()), + ), + (Statistics::ByteArray(val), ConvertedType::UTF8) => ( + byte_array_to_string(val.min_opt()), + byte_array_to_string(val.max_opt()), + ), + (Statistics::ByteArray(val), _) => ( + val.min_opt().map(|v| v.to_string()), + val.max_opt().map(|v| v.to_string()), + ), + (Statistics::FixedLenByteArray(val), ConvertedType::UTF8) => ( + fixed_len_byte_array_to_string(val.min_opt()), + fixed_len_byte_array_to_string(val.max_opt()), + ), + (Statistics::FixedLenByteArray(val), _) => ( + val.min_opt().map(|v| v.to_string()), + val.max_opt().map(|v| v.to_string()), + ), + } +} + +/// Convert to a string if it has utf8 encoding, otherwise print bytes directly +fn byte_array_to_string(val: Option<&ByteArray>) -> Option { + val.map(|v| { + v.as_utf8() + .map(|s| s.to_string()) + .unwrap_or_else(|_e| v.to_string()) + }) +} + +/// Convert to a string if it has utf8 encoding, otherwise print bytes directly +fn fixed_len_byte_array_to_string(val: Option<&FixedLenByteArray>) -> Option { + val.map(|v| { + v.as_utf8() + .map(|s| s.to_string()) + .unwrap_or_else(|_e| v.to_string()) + }) +} + +#[derive(Debug)] +pub struct ParquetMetadataFunc {} + +impl TableFunctionImpl for ParquetMetadataFunc { + fn call(&self, exprs: &[Expr]) -> Result> { + let filename = match exprs.first() { + Some(Expr::Literal(ScalarValue::Utf8(Some(s)))) => s, // single quote: parquet_metadata('x.parquet') + Some(Expr::Column(Column { name, .. 
})) => name, // double quote: parquet_metadata("x.parquet") + _ => { + return plan_err!( + "parquet_metadata requires string argument as its input" + ); + } + }; + + let file = File::open(filename.clone())?; + let reader = SerializedFileReader::new(file).map_err(datafusion::error::DataFusionError::from)?; + let metadata = reader.metadata(); + + let schema = Arc::new(Schema::new(vec![ + Field::new("filename", DataType::Utf8, true), + Field::new("row_group_id", DataType::Int64, true), + Field::new("row_group_num_rows", DataType::Int64, true), + Field::new("row_group_num_columns", DataType::Int64, true), + Field::new("row_group_bytes", DataType::Int64, true), + Field::new("column_id", DataType::Int64, true), + Field::new("file_offset", DataType::Int64, true), + Field::new("num_values", DataType::Int64, true), + Field::new("path_in_schema", DataType::Utf8, true), + Field::new("type", DataType::Utf8, true), + Field::new("stats_min", DataType::Utf8, true), + Field::new("stats_max", DataType::Utf8, true), + Field::new("stats_null_count", DataType::Int64, true), + Field::new("stats_distinct_count", DataType::Int64, true), + Field::new("stats_min_value", DataType::Utf8, true), + Field::new("stats_max_value", DataType::Utf8, true), + Field::new("compression", DataType::Utf8, true), + Field::new("encodings", DataType::Utf8, true), + Field::new("index_page_offset", DataType::Int64, true), + Field::new("dictionary_page_offset", DataType::Int64, true), + Field::new("data_page_offset", DataType::Int64, true), + Field::new("total_compressed_size", DataType::Int64, true), + Field::new("total_uncompressed_size", DataType::Int64, true), + ])); + + // construct record batch from metadata + let mut filename_arr = vec![]; + let mut row_group_id_arr = vec![]; + let mut row_group_num_rows_arr = vec![]; + let mut row_group_num_columns_arr = vec![]; + let mut row_group_bytes_arr = vec![]; + let mut column_id_arr = vec![]; + let mut file_offset_arr = vec![]; + let mut num_values_arr = vec![]; + let mut path_in_schema_arr = vec![]; + let mut type_arr = vec![]; + let mut stats_min_arr = vec![]; + let mut stats_max_arr = vec![]; + let mut stats_null_count_arr = vec![]; + let mut stats_distinct_count_arr = vec![]; + let mut stats_min_value_arr = vec![]; + let mut stats_max_value_arr = vec![]; + let mut compression_arr = vec![]; + let mut encodings_arr = vec![]; + let mut index_page_offset_arr = vec![]; + let mut dictionary_page_offset_arr = vec![]; + let mut data_page_offset_arr = vec![]; + let mut total_compressed_size_arr = vec![]; + let mut total_uncompressed_size_arr = vec![]; + for (rg_idx, row_group) in metadata.row_groups().iter().enumerate() { + for (col_idx, column) in row_group.columns().iter().enumerate() { + filename_arr.push(filename.clone()); + row_group_id_arr.push(rg_idx as i64); + row_group_num_rows_arr.push(row_group.num_rows()); + row_group_num_columns_arr.push(row_group.num_columns() as i64); + row_group_bytes_arr.push(row_group.total_byte_size()); + column_id_arr.push(col_idx as i64); + file_offset_arr.push(column.file_offset()); + num_values_arr.push(column.num_values()); + path_in_schema_arr.push(column.column_path().to_string()); + type_arr.push(column.column_type().to_string()); + let converted_type = column.column_descr().converted_type(); + + if let Some(s) = column.statistics() { + let (min_val, max_val) = + convert_parquet_statistics(s, converted_type); + stats_min_arr.push(min_val.clone()); + stats_max_arr.push(max_val.clone()); + stats_null_count_arr.push(s.null_count_opt().map(|c| c as 
i64)); + stats_distinct_count_arr + .push(s.distinct_count_opt().map(|c| c as i64)); + stats_min_value_arr.push(min_val); + stats_max_value_arr.push(max_val); + } else { + stats_min_arr.push(None); + stats_max_arr.push(None); + stats_null_count_arr.push(None); + stats_distinct_count_arr.push(None); + stats_min_value_arr.push(None); + stats_max_value_arr.push(None); + }; + compression_arr.push(format!("{:?}", column.compression())); + encodings_arr.push(format!("{:?}", column.encodings())); + index_page_offset_arr.push(column.index_page_offset()); + dictionary_page_offset_arr.push(column.dictionary_page_offset()); + data_page_offset_arr.push(column.data_page_offset()); + total_compressed_size_arr.push(column.compressed_size()); + total_uncompressed_size_arr.push(column.uncompressed_size()); + } + } + + let rb = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(StringArray::from(filename_arr)), + Arc::new(Int64Array::from(row_group_id_arr)), + Arc::new(Int64Array::from(row_group_num_rows_arr)), + Arc::new(Int64Array::from(row_group_num_columns_arr)), + Arc::new(Int64Array::from(row_group_bytes_arr)), + Arc::new(Int64Array::from(column_id_arr)), + Arc::new(Int64Array::from(file_offset_arr)), + Arc::new(Int64Array::from(num_values_arr)), + Arc::new(StringArray::from(path_in_schema_arr)), + Arc::new(StringArray::from(type_arr)), + Arc::new(StringArray::from(stats_min_arr)), + Arc::new(StringArray::from(stats_max_arr)), + Arc::new(Int64Array::from(stats_null_count_arr)), + Arc::new(Int64Array::from(stats_distinct_count_arr)), + Arc::new(StringArray::from(stats_min_value_arr)), + Arc::new(StringArray::from(stats_max_value_arr)), + Arc::new(StringArray::from(compression_arr)), + Arc::new(StringArray::from(encodings_arr)), + Arc::new(Int64Array::from(index_page_offset_arr)), + Arc::new(Int64Array::from(dictionary_page_offset_arr)), + Arc::new(Int64Array::from(data_page_offset_arr)), + Arc::new(Int64Array::from(total_compressed_size_arr)), + Arc::new(Int64Array::from(total_uncompressed_size_arr)), + ], + )?; + + let parquet_metadata = ParquetMetadataTable { schema, batch: rb }; + Ok(Arc::new(parquet_metadata)) + } +} diff --git a/src/query/mod.rs b/src/query/mod.rs index fbdf1d0c4..8da7cb41b 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -19,8 +19,11 @@ mod filter_optimizer; mod listing_table_builder; pub mod stream_schema_provider; +pub mod catalog; +pub mod functions; +pub mod object_storage; -use arrow_schema::DataType; +use catalog::DynamicObjectStoreCatalog; use chrono::NaiveDateTime; use chrono::{DateTime, Duration, Utc}; use datafusion::arrow::record_batch::RecordBatch; @@ -29,12 +32,14 @@ use datafusion::common::exec_datafusion_err; use datafusion::common::tree_node::{Transformed, TreeNode, TreeNodeRecursion, TreeNodeVisitor}; use datafusion::error::{DataFusionError, Result}; use datafusion::execution::disk_manager::DiskManagerConfig; +use datafusion::execution::runtime_env::RuntimeEnvBuilder; use datafusion::execution::SessionStateBuilder; use datafusion::logical_expr::expr::Alias; use datafusion::logical_expr::{ Aggregate, Explain, Filter, LogicalPlan, PlanType, Projection, ToStringifiedPlan, }; use datafusion::prelude::*; +use functions::ParquetMetadataFunc; use itertools::Itertools; use once_cell::sync::Lazy; use relative_path::RelativePathBuf; @@ -658,6 +663,8 @@ impl AllQueries { } pub async fn run() -> Result<()> { + let rt_config = RuntimeEnvBuilder::new(); + let runtime_env = rt_config.build().unwrap(); println!("Running benchmarks"); let queries_path: PathBuf = 
["/home","ubuntu", "clickbench", "queries.sql"].iter().collect(); let queries = AllQueries::try_new(queries_path.as_path())?; @@ -676,15 +683,27 @@ pub async fn run() -> Result<()> { config.options_mut().execution.parquet.pushdown_filters = true; config.options_mut().execution.parquet.reorder_filters = true; config.options_mut().execution.use_row_number_estimates_to_optimize_partitioning = true; - let ctx = SessionContext::new_with_config(config); + // enable dynamic file query + let ctx = + SessionContext::new_with_config_rt(config, Arc::new(runtime_env)) + .enable_url_table(); + ctx.refresh_catalogs().await?; + // install dynamic catalog provider that can register required object stores + ctx.register_catalog_list(Arc::new(DynamicObjectStoreCatalog::new( + ctx.state().catalog_list().clone(), + ctx.state_weak_ref(), + ))); + // register `parquet_metadata` table function to get metadata from parquet files + ctx.register_udtf("parquet_metadata", Arc::new(ParquetMetadataFunc {})); + register_hits(&ctx).await?; for query_id in query_range { let sql = queries.get_query(query_id)?; - println!("Q{query_id}: {sql}"); - + let plan = ctx.state().create_logical_plan(sql).await?; let start = Instant::now(); - let _ = ctx.sql(sql).await?.collect().await?; + let df = ctx.execute_logical_plan(plan).await?; + let _ = df.collect().await?; let elapsed = start.elapsed().as_secs_f64(); println!("Q{query_id} took {elapsed} seconds"); @@ -694,14 +713,14 @@ pub async fn run() -> Result<()> { /// Registers the `hits.parquet` as a table named `hits` async fn register_hits(ctx: &SessionContext) -> Result<()> { - let mut options: ParquetReadOptions<'_> = Default::default(); - options.table_partition_cols = vec!["date", "hour", "minute"] - .iter() - .map(|s| (s.to_string(), DataType::Utf8)) - .collect(); - let schema = STREAM_INFO.schema("hits").unwrap(); - options.schema = Some(&schema); - let path: PathBuf = ["/home", "ubuntu", "parseable", "data", "hits"].iter().collect(); + let options: ParquetReadOptions<'_> = Default::default(); + // options.table_partition_cols = vec!["date", "hour", "minute"] + // .iter() + // .map(|s| (s.to_string(), DataType::Utf8)) + // .collect(); + // let schema = STREAM_INFO.schema("hits").unwrap(); + // options.schema = Some(&schema); + let path: PathBuf = ["/home", "ubuntu", "clickbench", "hits.parquet"].iter().collect(); let path = path.as_os_str().to_str().unwrap(); println!("Registering 'hits' as {path}"); ctx.register_parquet("hits", path, options) diff --git a/src/query/object_storage.rs b/src/query/object_storage.rs new file mode 100644 index 000000000..3a540eb94 --- /dev/null +++ b/src/query/object_storage.rs @@ -0,0 +1,456 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::any::Any; +use std::fmt::{Debug, Display}; +use std::sync::Arc; + +use datafusion::common::config::{ + ConfigEntry, ConfigExtension, ConfigField, ExtensionOptions, TableOptions, Visit, +}; +use datafusion::common::{config_err, exec_datafusion_err, exec_err}; +use datafusion::error::{DataFusionError, Result}; +use datafusion::execution::context::SessionState; + +use async_trait::async_trait; +use aws_config::BehaviorVersion; +use aws_credential_types::provider::ProvideCredentials; +use object_store::aws::{AmazonS3Builder, AwsCredential}; +use object_store::gcp::GoogleCloudStorageBuilder; +use object_store::http::HttpBuilder; +use object_store::{ClientOptions, CredentialProvider, ObjectStore}; +use url::Url; + +pub async fn get_s3_object_store_builder( + url: &Url, + aws_options: &AwsOptions, +) -> Result { + let AwsOptions { + access_key_id, + secret_access_key, + session_token, + region, + endpoint, + allow_http, + } = aws_options; + + let bucket_name = get_bucket_name(url)?; + let mut builder = AmazonS3Builder::from_env().with_bucket_name(bucket_name); + + if let (Some(access_key_id), Some(secret_access_key)) = + (access_key_id, secret_access_key) + { + builder = builder + .with_access_key_id(access_key_id) + .with_secret_access_key(secret_access_key); + + if let Some(session_token) = session_token { + builder = builder.with_token(session_token); + } + } else { + let config = aws_config::defaults(BehaviorVersion::latest()).load().await; + if let Some(region) = config.region() { + builder = builder.with_region(region.to_string()); + } + + let credentials = config + .credentials_provider() + .ok_or_else(|| { + DataFusionError::ObjectStore(object_store::Error::Generic { + store: "S3", + source: "Failed to get S3 credentials from the environment".into(), + }) + })? + .clone(); + + let credentials = Arc::new(S3CredentialProvider { credentials }); + builder = builder.with_credentials(credentials); + } + + if let Some(region) = region { + builder = builder.with_region(region); + } + + if let Some(endpoint) = endpoint { + // Make a nicer error if the user hasn't allowed http and the endpoint + // is http as the default message is "URL scheme is not allowed" + if let Ok(endpoint_url) = Url::try_from(endpoint.as_str()) { + if !matches!(allow_http, Some(true)) && endpoint_url.scheme() == "http" { + return config_err!( + "Invalid endpoint: {endpoint}. \ + HTTP is not allowed for S3 endpoints. 
\ + To allow HTTP, set 'aws.allow_http' to true" + ); + } + } + + builder = builder.with_endpoint(endpoint); + } + + if let Some(allow_http) = allow_http { + builder = builder.with_allow_http(*allow_http); + } + + Ok(builder) +} + +#[derive(Debug)] +struct S3CredentialProvider { + credentials: aws_credential_types::provider::SharedCredentialsProvider, +} + +#[async_trait] +impl CredentialProvider for S3CredentialProvider { + type Credential = AwsCredential; + + async fn get_credential(&self) -> object_store::Result> { + let creds = self.credentials.provide_credentials().await.map_err(|e| { + object_store::Error::Generic { + store: "S3", + source: Box::new(e), + } + })?; + Ok(Arc::new(AwsCredential { + key_id: creds.access_key_id().to_string(), + secret_key: creds.secret_access_key().to_string(), + token: creds.session_token().map(ToString::to_string), + })) + } +} + +pub fn get_oss_object_store_builder( + url: &Url, + aws_options: &AwsOptions, +) -> Result { + get_object_store_builder(url, aws_options, true) +} + +pub fn get_cos_object_store_builder( + url: &Url, + aws_options: &AwsOptions, +) -> Result { + get_object_store_builder(url, aws_options, false) +} + +fn get_object_store_builder( + url: &Url, + aws_options: &AwsOptions, + virtual_hosted_style_request: bool, +) -> Result { + let bucket_name = get_bucket_name(url)?; + let mut builder = AmazonS3Builder::from_env() + .with_virtual_hosted_style_request(virtual_hosted_style_request) + .with_bucket_name(bucket_name) + // oss/cos don't care about the "region" field + .with_region("do_not_care"); + + if let (Some(access_key_id), Some(secret_access_key)) = + (&aws_options.access_key_id, &aws_options.secret_access_key) + { + builder = builder + .with_access_key_id(access_key_id) + .with_secret_access_key(secret_access_key); + } + + if let Some(endpoint) = &aws_options.endpoint { + builder = builder.with_endpoint(endpoint); + } + + Ok(builder) +} + +pub fn get_gcs_object_store_builder( + url: &Url, + gs_options: &GcpOptions, +) -> Result { + let bucket_name = get_bucket_name(url)?; + let mut builder = GoogleCloudStorageBuilder::from_env().with_bucket_name(bucket_name); + + if let Some(service_account_path) = &gs_options.service_account_path { + builder = builder.with_service_account_path(service_account_path); + } + + if let Some(service_account_key) = &gs_options.service_account_key { + builder = builder.with_service_account_key(service_account_key); + } + + if let Some(application_credentials_path) = &gs_options.application_credentials_path { + builder = builder.with_application_credentials(application_credentials_path); + } + + Ok(builder) +} + +fn get_bucket_name(url: &Url) -> Result<&str> { + url.host_str().ok_or_else(|| { + DataFusionError::Execution(format!( + "Not able to parse bucket name from url: {}", + url.as_str() + )) + }) +} + +/// This struct encapsulates AWS options one uses when setting up object storage. 
+#[derive(Default, Debug, Clone)] +pub struct AwsOptions { + /// Access Key ID + pub access_key_id: Option, + /// Secret Access Key + pub secret_access_key: Option, + /// Session token + pub session_token: Option, + /// AWS Region + pub region: Option, + /// OSS or COS Endpoint + pub endpoint: Option, + /// Allow HTTP (otherwise will always use https) + pub allow_http: Option, +} + +impl ExtensionOptions for AwsOptions { + fn as_any(&self) -> &dyn Any { + self + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + fn cloned(&self) -> Box { + Box::new(self.clone()) + } + + fn set(&mut self, key: &str, value: &str) -> Result<()> { + let (_key, aws_key) = key.split_once('.').unwrap_or((key, "")); + let (key, rem) = aws_key.split_once('.').unwrap_or((aws_key, "")); + match key { + "access_key_id" => { + self.access_key_id.set(rem, value)?; + } + "secret_access_key" => { + self.secret_access_key.set(rem, value)?; + } + "session_token" => { + self.session_token.set(rem, value)?; + } + "region" => { + self.region.set(rem, value)?; + } + "oss" | "cos" | "endpoint" => { + self.endpoint.set(rem, value)?; + } + "allow_http" => { + self.allow_http.set(rem, value)?; + } + _ => { + return config_err!("Config value \"{}\" not found on AwsOptions", rem); + } + } + Ok(()) + } + + fn entries(&self) -> Vec { + struct Visitor(Vec); + + impl Visit for Visitor { + fn some( + &mut self, + key: &str, + value: V, + description: &'static str, + ) { + self.0.push(ConfigEntry { + key: key.to_string(), + value: Some(value.to_string()), + description, + }) + } + + fn none(&mut self, key: &str, description: &'static str) { + self.0.push(ConfigEntry { + key: key.to_string(), + value: None, + description, + }) + } + } + + let mut v = Visitor(vec![]); + self.access_key_id.visit(&mut v, "access_key_id", ""); + self.secret_access_key + .visit(&mut v, "secret_access_key", ""); + self.session_token.visit(&mut v, "session_token", ""); + self.region.visit(&mut v, "region", ""); + self.endpoint.visit(&mut v, "endpoint", ""); + self.allow_http.visit(&mut v, "allow_http", ""); + v.0 + } +} + +impl ConfigExtension for AwsOptions { + const PREFIX: &'static str = "aws"; +} + +/// This struct encapsulates GCP options one uses when setting up object storage. 
+#[derive(Debug, Clone, Default)] +pub struct GcpOptions { + /// Service account path + pub service_account_path: Option, + /// Service account key + pub service_account_key: Option, + /// Application credentials path + pub application_credentials_path: Option, +} + +impl ExtensionOptions for GcpOptions { + fn as_any(&self) -> &dyn Any { + self + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + fn cloned(&self) -> Box { + Box::new(self.clone()) + } + + fn set(&mut self, key: &str, value: &str) -> Result<()> { + let (_key, rem) = key.split_once('.').unwrap_or((key, "")); + match rem { + "service_account_path" => { + self.service_account_path.set(rem, value)?; + } + "service_account_key" => { + self.service_account_key.set(rem, value)?; + } + "application_credentials_path" => { + self.application_credentials_path.set(rem, value)?; + } + _ => { + return config_err!("Config value \"{}\" not found on GcpOptions", rem); + } + } + Ok(()) + } + + fn entries(&self) -> Vec { + struct Visitor(Vec); + + impl Visit for Visitor { + fn some( + &mut self, + key: &str, + value: V, + description: &'static str, + ) { + self.0.push(ConfigEntry { + key: key.to_string(), + value: Some(value.to_string()), + description, + }) + } + + fn none(&mut self, key: &str, description: &'static str) { + self.0.push(ConfigEntry { + key: key.to_string(), + value: None, + description, + }) + } + } + + let mut v = Visitor(vec![]); + self.service_account_path + .visit(&mut v, "service_account_path", ""); + self.service_account_key + .visit(&mut v, "service_account_key", ""); + self.application_credentials_path.visit( + &mut v, + "application_credentials_path", + "", + ); + v.0 + } +} + +impl ConfigExtension for GcpOptions { + const PREFIX: &'static str = "gcp"; +} + +pub(crate) async fn get_object_store( + state: &SessionState, + scheme: &str, + url: &Url, + table_options: &TableOptions, +) -> Result, DataFusionError> { + let store: Arc = match scheme { + "s3" => { + let Some(options) = table_options.extensions.get::() else { + return exec_err!( + "Given table options incompatible with the 's3' scheme" + ); + }; + let builder = get_s3_object_store_builder(url, options).await?; + Arc::new(builder.build()?) + } + "oss" => { + let Some(options) = table_options.extensions.get::() else { + return exec_err!( + "Given table options incompatible with the 'oss' scheme" + ); + }; + let builder = get_oss_object_store_builder(url, options)?; + Arc::new(builder.build()?) + } + "cos" => { + let Some(options) = table_options.extensions.get::() else { + return exec_err!( + "Given table options incompatible with the 'cos' scheme" + ); + }; + let builder = get_cos_object_store_builder(url, options)?; + Arc::new(builder.build()?) + } + "gs" | "gcs" => { + let Some(options) = table_options.extensions.get::() else { + return exec_err!( + "Given table options incompatible with the 'gs'/'gcs' scheme" + ); + }; + let builder = get_gcs_object_store_builder(url, options)?; + Arc::new(builder.build()?) + } + "http" | "https" => Arc::new( + HttpBuilder::new() + .with_client_options(ClientOptions::new().with_allow_http(true)) + .with_url(url.origin().ascii_serialization()) + .build()?, + ), + _ => { + // For other types, try to get from `object_store_registry`: + state + .runtime_env() + .object_store_registry + .get_store(url) + .map_err(|_| { + exec_datafusion_err!("Unsupported object store scheme: {}", scheme) + })? 
+ } + }; + Ok(store) +} \ No newline at end of file diff --git a/src/query/stream_schema_provider.rs b/src/query/stream_schema_provider.rs index 05a9a666b..25938c44b 100644 --- a/src/query/stream_schema_provider.rs +++ b/src/query/stream_schema_provider.rs @@ -29,6 +29,10 @@ use bytes::Bytes; use chrono::{DateTime, NaiveDateTime, Timelike, Utc}; use datafusion::catalog::Session; use datafusion::common::stats::Precision; +use datafusion::common::Constraints; +use datafusion::config::TableParquetOptions; +use datafusion::datasource::listing::file_compression_type::FileCompressionType; +use datafusion::datasource::physical_plan::ParquetSource; use datafusion::logical_expr::utils::conjunction; use datafusion::physical_expr::LexOrdering; use datafusion::{ @@ -163,6 +167,10 @@ impl StandardTableProvider { limit, output_ordering: vec![LexOrdering::from_iter([sort_expr])], table_partition_cols: Vec::new(), + constraints: Constraints::default(), + file_compression_type: FileCompressionType::ZSTD, + new_lines_in_values: false, + source: Arc::new(ParquetSource::new(TableParquetOptions::default())), }, filters.as_ref(), ) @@ -338,6 +346,7 @@ impl StandardTableProvider { max_value: Precision::Exact(max), min_value: Precision::Exact(min), distinct_count: Precision::Absent, + sum_value: Precision::Absent }) .unwrap_or_default() }) diff --git a/src/utils/arrow/flight.rs b/src/utils/arrow/flight.rs index 8a78eb957..9f9af7186 100644 --- a/src/utils/arrow/flight.rs +++ b/src/utils/arrow/flight.rs @@ -1,160 +1,162 @@ -/* - * Parseable Server (C) 2022 - 2024 Parseable, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - * - */ - -use crate::event::Event; -use crate::handlers::http::ingest::push_logs_unchecked; -use crate::handlers::http::query::Query as QueryJson; -use crate::metadata::STREAM_INFO; -use crate::query::stream_schema_provider::include_now; -use crate::{ - handlers::http::modal::IngestorMetadata, - option::{Mode, CONFIG}, -}; - -use arrow_array::RecordBatch; -use arrow_flight::encode::FlightDataEncoderBuilder; -use arrow_flight::{FlightData, Ticket}; -use arrow_ipc::writer::IpcWriteOptions; -use arrow_select::concat::concat_batches; -use datafusion::logical_expr::BinaryExpr; -use datafusion::prelude::Expr; -use datafusion::scalar::ScalarValue; -use futures::{stream, TryStreamExt}; - -use tonic::{Request, Response, Status}; - -use arrow_flight::FlightClient; -// use http::Uri; -use tonic::transport::{Channel, Uri}; - -pub type DoGetStream = stream::BoxStream<'static, Result>; - -pub fn get_query_from_ticket(req: &Request) -> Result { - serde_json::from_slice::(&req.get_ref().ticket) - .map_err(|err| Status::internal(err.to_string())) -} - -pub async fn run_do_get_rpc( - im: IngestorMetadata, - ticket: String, -) -> Result, Status> { - let url = im - .domain_name - .rsplit_once(':') - .ok_or(Status::failed_precondition( - "Ingester metadata is courupted", - ))? 
- .0; - let url = format!("{}:{}", url, im.flight_port); - let url = url - .parse::() - .map_err(|_| Status::failed_precondition("Ingester metadata is courupted"))?; - let channel = Channel::builder(url) - .connect() - .await - .map_err(|err| Status::failed_precondition(err.to_string()))?; - - let client = FlightClient::new(channel); - let inn = client - .into_inner() - .accept_compressed(tonic::codec::CompressionEncoding::Gzip) - .max_decoding_message_size(usize::MAX) - .max_encoding_message_size(usize::MAX); - - let mut client = FlightClient::new_from_inner(inn); - - client.add_header("authorization", &im.token)?; - - let response = client - .do_get(Ticket { - ticket: ticket.into(), - }) - .await?; - - Ok(response.try_collect().await?) -} - -/// all the records from the ingesters are concatinated into one event and pushed to memory -pub async fn append_temporary_events( - stream_name: &str, - minute_result: Vec<&RecordBatch>, -) -> Result< - //Vec - Event, - Status, -> { - let schema = STREAM_INFO - .schema(stream_name) - .map_err(|err| Status::failed_precondition(format!("Metadata Error: {}", err)))?; - let rb = concat_batches(&schema, minute_result) - .map_err(|err| Status::failed_precondition(format!("ArrowError: {}", err)))?; - - let event = push_logs_unchecked(rb, stream_name) - .await - .map_err(|err| Status::internal(err.to_string()))?; - Ok(event) -} - -pub fn send_to_ingester(start: i64, end: i64) -> bool { - let filter_start = lit_timestamp_milli( - start, //query.start.timestamp_millis() - ); - let filter_end = lit_timestamp_milli( - end, //query.end.timestamp_millis() - ); - - let expr_left = Expr::Column(datafusion::common::Column { - relation: None, - name: "p_timestamp".to_owned(), - }); - - let ex1 = BinaryExpr::new( - Box::new(expr_left.clone()), - datafusion::logical_expr::Operator::Gt, - Box::new(filter_start), - ); - let ex2 = BinaryExpr::new( - Box::new(expr_left), - datafusion::logical_expr::Operator::Lt, - Box::new(filter_end), - ); - let ex = [Expr::BinaryExpr(ex1), Expr::BinaryExpr(ex2)]; - - CONFIG.options.mode == Mode::Query && include_now(&ex, &None) -} - -fn lit_timestamp_milli(time: i64) -> Expr { - Expr::Literal(ScalarValue::TimestampMillisecond(Some(time), None)) -} - -pub fn into_flight_data(records: Vec) -> Result, Status> { - let input_stream = futures::stream::iter(records.into_iter().map(Ok)); - let write_options = IpcWriteOptions::default() - .try_with_compression(Some(arrow_ipc::CompressionType(1))) - .map_err(|err| Status::failed_precondition(err.to_string()))?; - - let flight_data_stream = FlightDataEncoderBuilder::new() - .with_max_flight_data_size(usize::MAX) - .with_options(write_options) - // .with_schema(schema.into()) - .build(input_stream); - - let flight_data_stream = flight_data_stream.map_err(|err| Status::unknown(err.to_string())); - - Ok(Response::new(Box::pin(flight_data_stream) as DoGetStream)) -} +// /* +// * Parseable Server (C) 2022 - 2024 Parseable, Inc. +// * +// * This program is free software: you can redistribute it and/or modify +// * it under the terms of the GNU Affero General Public License as +// * published by the Free Software Foundation, either version 3 of the +// * License, or (at your option) any later version. +// * +// * This program is distributed in the hope that it will be useful, +// * but WITHOUT ANY WARRANTY; without even the implied warranty of +// * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// * GNU Affero General Public License for more details. 
+// * +// * You should have received a copy of the GNU Affero General Public License +// * along with this program. If not, see . +// * +// */ + +// use crate::event::Event; +// use crate::handlers::http::ingest::push_logs_unchecked; +// use crate::handlers::http::query::Query as QueryJson; +// use crate::metadata::STREAM_INFO; +// use crate::query::stream_schema_provider::include_now; +// use crate::{ +// handlers::http::modal::IngestorMetadata, +// option::{Mode, CONFIG}, +// }; + +// use arrow_array::RecordBatch; +// use arrow_flight::encode::FlightDataEncoderBuilder; +// use arrow_flight::{FlightData, Ticket}; +// use arrow_ipc::writer::IpcWriteOptions; +// use arrow_select::concat::concat_batches; +// use datafusion::common::{Location, Span}; +// use datafusion::logical_expr::BinaryExpr; +// use datafusion::prelude::Expr; +// use datafusion::scalar::ScalarValue; +// use futures::{stream, TryStreamExt}; + +// use tonic::{Request, Response, Status}; + +// use arrow_flight::FlightClient; +// // use http::Uri; +// use tonic::transport::{Channel, Uri}; + +// pub type DoGetStream = stream::BoxStream<'static, Result>; + +// pub fn get_query_from_ticket(req: &Request) -> Result { +// serde_json::from_slice::(&req.get_ref().ticket) +// .map_err(|err| Status::internal(err.to_string())) +// } + +// pub async fn run_do_get_rpc( +// im: IngestorMetadata, +// ticket: String, +// ) -> Result, Status> { +// let url = im +// .domain_name +// .rsplit_once(':') +// .ok_or(Status::failed_precondition( +// "Ingester metadata is courupted", +// ))? +// .0; +// let url = format!("{}:{}", url, im.flight_port); +// let url = url +// .parse::() +// .map_err(|_| Status::failed_precondition("Ingester metadata is courupted"))?; +// let channel = Channel::builder(url) +// .connect() +// .await +// .map_err(|err| Status::failed_precondition(err.to_string()))?; + +// let client = FlightClient::new(channel); +// let inn = client +// .into_inner() +// .accept_compressed(tonic::codec::CompressionEncoding::Gzip) +// .max_decoding_message_size(usize::MAX) +// .max_encoding_message_size(usize::MAX); + +// let mut client = FlightClient::new_from_inner(inn); + +// client.add_header("authorization", &im.token)?; + +// let response = client +// .do_get(Ticket { +// ticket: ticket.into(), +// }) +// .await?; + +// Ok(response.try_collect().await?) 
+// } + +// /// all the records from the ingesters are concatinated into one event and pushed to memory +// pub async fn append_temporary_events( +// stream_name: &str, +// minute_result: Vec<&RecordBatch>, +// ) -> Result< +// //Vec +// Event, +// Status, +// > { +// let schema = STREAM_INFO +// .schema(stream_name) +// .map_err(|err| Status::failed_precondition(format!("Metadata Error: {}", err)))?; +// let rb = concat_batches(&schema, minute_result) +// .map_err(|err| Status::failed_precondition(format!("ArrowError: {}", err)))?; + +// let event = push_logs_unchecked(rb, stream_name) +// .await +// .map_err(|err| Status::internal(err.to_string()))?; +// Ok(event) +// } + +// pub fn send_to_ingester(start: i64, end: i64) -> bool { +// let filter_start = lit_timestamp_milli( +// start, //query.start.timestamp_millis() +// ); +// let filter_end = lit_timestamp_milli( +// end, //query.end.timestamp_millis() +// ); + +// let expr_left = Expr::Column(datafusion::common::Column { +// relation: None, +// name: "p_timestamp".to_owned(), +// spans: Span::new(Location::new(0, 0), Location::new(0, 0)), +// }); + +// let ex1 = BinaryExpr::new( +// Box::new(expr_left.clone()), +// datafusion::logical_expr::Operator::Gt, +// Box::new(filter_start), +// ); +// let ex2 = BinaryExpr::new( +// Box::new(expr_left), +// datafusion::logical_expr::Operator::Lt, +// Box::new(filter_end), +// ); +// let ex = [Expr::BinaryExpr(ex1), Expr::BinaryExpr(ex2)]; + +// CONFIG.options.mode == Mode::Query && include_now(&ex, &None) +// } + +// fn lit_timestamp_milli(time: i64) -> Expr { +// Expr::Literal(ScalarValue::TimestampMillisecond(Some(time), None)) +// } + +// pub fn into_flight_data(records: Vec) -> Result, Status> { +// let input_stream = futures::stream::iter(records.into_iter().map(Ok)); +// let write_options = IpcWriteOptions::default() +// .try_with_compression(Some(arrow_ipc::CompressionType(1))) +// .map_err(|err| Status::failed_precondition(err.to_string()))?; + +// let flight_data_stream = FlightDataEncoderBuilder::new() +// .with_max_flight_data_size(usize::MAX) +// .with_options(write_options) +// // .with_schema(schema.into()) +// .build(input_stream); + +// let flight_data_stream = flight_data_stream.map_err(|err| Status::unknown(err.to_string())); + +// Ok(Response::new(Box::pin(flight_data_stream) as DoGetStream)) +// } From ba5f4ad8bf665ac9d75faf36e9662286440b5f86 Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Wed, 12 Feb 2025 07:09:24 -0500 Subject: [PATCH 16/32] remove print --- src/query/mod.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/query/mod.rs b/src/query/mod.rs index 8da7cb41b..2c6f6ca8b 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -669,7 +669,6 @@ pub async fn run() -> Result<()> { let queries_path: PathBuf = ["/home","ubuntu", "clickbench", "queries.sql"].iter().collect(); let queries = AllQueries::try_new(queries_path.as_path())?; println!("queries loaded"); - println!("query no. 
1: {:?}", queries.get_query(1)?); let query_range = queries.min_query_id()..=queries.max_query_id(); // configure parquet options From ba7112ffaa794100280e536cb9adffa12dc95187 Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Wed, 12 Feb 2025 23:42:35 -0500 Subject: [PATCH 17/32] create external table --- src/query/mod.rs | 64 ++++++++++++++++++++++++++++++------------------ 1 file changed, 40 insertions(+), 24 deletions(-) diff --git a/src/query/mod.rs b/src/query/mod.rs index 2c6f6ca8b..091e5f57f 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -28,19 +28,22 @@ use chrono::NaiveDateTime; use chrono::{DateTime, Duration, Utc}; use datafusion::arrow::record_batch::RecordBatch; -use datafusion::common::exec_datafusion_err; +use datafusion::common::{exec_datafusion_err, plan_err}; use datafusion::common::tree_node::{Transformed, TreeNode, TreeNodeRecursion, TreeNodeVisitor}; +use datafusion::config::ConfigFileType; +use datafusion::datasource::listing::ListingTableUrl; use datafusion::error::{DataFusionError, Result}; use datafusion::execution::disk_manager::DiskManagerConfig; use datafusion::execution::runtime_env::RuntimeEnvBuilder; use datafusion::execution::SessionStateBuilder; use datafusion::logical_expr::expr::Alias; use datafusion::logical_expr::{ - Aggregate, Explain, Filter, LogicalPlan, PlanType, Projection, ToStringifiedPlan, + Aggregate, DdlStatement, Explain, Filter, LogicalPlan, PlanType, Projection, ToStringifiedPlan }; use datafusion::prelude::*; use functions::ParquetMetadataFunc; use itertools::Itertools; +use object_storage::get_object_store; use once_cell::sync::Lazy; use relative_path::RelativePathBuf; use serde::{Deserialize, Serialize}; @@ -695,13 +698,12 @@ pub async fn run() -> Result<()> { // register `parquet_metadata` table function to get metadata from parquet files ctx.register_udtf("parquet_metadata", Arc::new(ParquetMetadataFunc {})); - register_hits(&ctx).await?; + register_hits().await?; for query_id in query_range { let sql = queries.get_query(query_id)?; - let plan = ctx.state().create_logical_plan(sql).await?; let start = Instant::now(); - let df = ctx.execute_logical_plan(plan).await?; + let df = ctx.sql(sql).await?; let _ = df.collect().await?; let elapsed = start.elapsed().as_secs_f64(); println!("Q{query_id} took {elapsed} seconds"); @@ -711,25 +713,39 @@ pub async fn run() -> Result<()> { } /// Registers the `hits.parquet` as a table named `hits` -async fn register_hits(ctx: &SessionContext) -> Result<()> { - let options: ParquetReadOptions<'_> = Default::default(); - // options.table_partition_cols = vec!["date", "hour", "minute"] - // .iter() - // .map(|s| (s.to_string(), DataType::Utf8)) - // .collect(); - // let schema = STREAM_INFO.schema("hits").unwrap(); - // options.schema = Some(&schema); - let path: PathBuf = ["/home", "ubuntu", "clickbench", "hits.parquet"].iter().collect(); - let path = path.as_os_str().to_str().unwrap(); - println!("Registering 'hits' as {path}"); - ctx.register_parquet("hits", path, options) - .await - .map_err(|e| { - DataFusionError::Context( - format!("Registering 'hits' as {path}"), - Box::new(e), - ) - }) +async fn register_hits() -> Result<()> { + + let location = "/home/ubuntu/clickbench/hits.parquet"; + let sql = "CREATE EXTERNAL TABLE hits STORED AS PARQUET LOCATION '{location}' OPTIONS ('binary_as_string' 'true')"; + let ctx = SessionContext::new(); + let plan = ctx.state().create_logical_plan(sql).await?; + println!("plan: {plan}"); + if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = 
&plan { + println!("cmd: {:?}", cmd); + let format =Some(ConfigFileType::PARQUET); + let table_path = ListingTableUrl::parse(location)?; + println!("table_path: {table_path}"); + let scheme = table_path.scheme(); + let url = table_path.as_ref(); + // Clone and modify the default table options based on the provided options + let mut table_options = ctx.state().default_table_options(); + println!("table_options: {:?}", table_options); + if let Some(format) = format { + table_options.set_config_format(format); + } + table_options.alter_with_string_hash_map(&cmd.options)?; + // Retrieve the appropriate object store based on the scheme, URL, and modified table options + let store = + get_object_store(&ctx.state(), scheme, url, &table_options).await?; + + // Register the retrieved object store in the session context's runtime environment + ctx.register_object_store(url, store); + + } else { + return plan_err!("LogicalPlan is not a CreateExternalTable"); + } + + Ok(()) } pub mod error { From b8ee32105b5fc88c4680ca5ca435311260d42674 Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Thu, 13 Feb 2025 07:28:11 -0500 Subject: [PATCH 18/32] execute plan --- Cargo.lock | 1 + Cargo.toml | 1 + src/event/format/json.rs | 21 ++- src/handlers/airplane.rs | 1 - src/handlers/http/query.rs | 11 +- src/query/catalog.rs | 29 ++-- src/query/functions.rs | 219 ++++++++++++++-------------- src/query/mod.rs | 111 +++++++------- src/query/object_storage.rs | 47 ++---- src/query/stream_schema_provider.rs | 2 +- src/utils/arrow/flight.rs | 1 - 11 files changed, 215 insertions(+), 229 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e4428d9ae..e0bd15fe0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3783,6 +3783,7 @@ dependencies = [ "serde_repr", "sha1_smol", "sha2", + "sqlparser", "static-files", "structopt-derive", "sysinfo", diff --git a/Cargo.toml b/Cargo.toml index e3b952695..5e097bea2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -136,6 +136,7 @@ structopt-derive = "0.4.18" dirs = "6.0.0" aws-config = "1.5.16" aws-credential-types = "1.2.1" +sqlparser = "0.54.0" [build-dependencies] cargo_toml = "0.20.1" diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 657cbdfc0..0b534aeee 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -114,23 +114,27 @@ impl EventFormat for Event { .iter() .map(|field| { if matches!(field.data_type(), DataType::Utf8View) { - Arc::new(Field::new(field.name(), DataType::Utf8, field.is_nullable())) + Arc::new(Field::new( + field.name(), + DataType::Utf8, + field.is_nullable(), + )) } else { field.clone() } }) .collect::>(), ); - + let array_capacity = round_upto_multiple_of_64(data.len()); let mut reader = ReaderBuilder::new(Arc::new(temp_schema)) .with_batch_size(array_capacity) .with_coerce_primitive(false) .with_strict_mode(false) .build_decoder()?; - + reader.serialize(&data)?; - + match reader.flush() { Ok(Some(temp_batch)) => { // Convert Utf8 arrays to Utf8View arrays where needed @@ -144,13 +148,18 @@ impl EventFormat for Event { .as_any() .downcast_ref::() .expect("Expected StringArray"); - Arc::new(StringViewArray::from(string_array.iter().map(|s| s.map(|s| s.to_string())).collect::>())) + Arc::new(StringViewArray::from( + string_array + .iter() + .map(|s| s.map(|s| s.to_string())) + .collect::>(), + )) } else { col.clone() } }) .collect(); - + Ok(RecordBatch::try_new(schema, new_columns)?) 
} Err(err) => Err(anyhow!("Failed to create recordbatch due to {:?}", err)), diff --git a/src/handlers/airplane.rs b/src/handlers/airplane.rs index abde7b8bd..92d5e12b4 100644 --- a/src/handlers/airplane.rs +++ b/src/handlers/airplane.rs @@ -15,7 +15,6 @@ // * along with this program. If not, see . // * // */ - // use arrow_array::RecordBatch; // use arrow_flight::flight_service_server::FlightServiceServer; // use arrow_flight::PollInfo; diff --git a/src/handlers/http/query.rs b/src/handlers/http/query.rs index f090eb93a..f5418c073 100644 --- a/src/handlers/http/query.rs +++ b/src/handlers/http/query.rs @@ -69,7 +69,6 @@ pub struct Query { } pub async fn query(req: HttpRequest, query_request: Query) -> Result { - let _ = run().await; println!("benchmarking complete"); let session_state = QUERY_SESSION.state(); @@ -116,7 +115,10 @@ pub async fn query(req: HttpRequest, query_request: Query) -> Result Result, - state: Weak>, - ) -> Self { + pub fn new(inner: Arc, state: Weak>) -> Self { Self { inner, state } } } @@ -51,9 +48,9 @@ impl CatalogProviderList for DynamicObjectStoreCatalog { fn catalog(&self, name: &str) -> Option> { let state = self.state.clone(); - self.inner.catalog(name).map(|catalog| { - Arc::new(DynamicObjectStoreCatalogProvider::new(catalog, state)) as _ - }) + self.inner + .catalog(name) + .map(|catalog| Arc::new(DynamicObjectStoreCatalogProvider::new(catalog, state)) as _) } } @@ -65,10 +62,7 @@ struct DynamicObjectStoreCatalogProvider { } impl DynamicObjectStoreCatalogProvider { - pub fn new( - inner: Arc, - state: Weak>, - ) -> Self { + pub fn new(inner: Arc, state: Weak>) -> Self { Self { inner, state } } } @@ -84,9 +78,9 @@ impl CatalogProvider for DynamicObjectStoreCatalogProvider { fn schema(&self, name: &str) -> Option> { let state = self.state.clone(); - self.inner.schema(name).map(|schema| { - Arc::new(DynamicObjectStoreSchemaProvider::new(schema, state)) as _ - }) + self.inner + .schema(name) + .map(|schema| Arc::new(DynamicObjectStoreSchemaProvider::new(schema, state)) as _) } fn register_schema( @@ -107,10 +101,7 @@ struct DynamicObjectStoreSchemaProvider { } impl DynamicObjectStoreSchemaProvider { - pub fn new( - inner: Arc, - state: Weak>, - ) -> Self { + pub fn new(inner: Arc, state: Weak>) -> Self { Self { inner, state } } } @@ -209,4 +200,4 @@ pub fn substitute_tilde(cur: String) -> String { } } cur -} \ No newline at end of file +} diff --git a/src/query/functions.rs b/src/query/functions.rs index 9c84987a5..55d4841eb 100644 --- a/src/query/functions.rs +++ b/src/query/functions.rs @@ -63,109 +63,109 @@ pub enum Function { // ]; impl Function { -// pub fn function_details(&self) -> Result<&str> { -// let details = match self { -// Function::Select => { -// r#" -// Command: SELECT -// Description: retrieve rows from a table or view -// Syntax: -// SELECT [ ALL | DISTINCT [ ON ( expression [, ...] ) ] ] -// [ * | expression [ [ AS ] output_name ] [, ...] ] -// [ FROM from_item [, ...] ] -// [ WHERE condition ] -// [ GROUP BY [ ALL | DISTINCT ] grouping_element [, ...] ] -// [ HAVING condition ] -// [ WINDOW window_name AS ( window_definition ) [, ...] ] -// [ { UNION | INTERSECT | EXCEPT } [ ALL | DISTINCT ] select ] -// [ ORDER BY expression [ ASC | DESC | USING operator ] [ NULLS { FIRST | LAST } ] [, ...] ] -// [ LIMIT { count | ALL } ] -// [ OFFSET start [ ROW | ROWS ] ] - -// where from_item can be one of: - -// [ ONLY ] table_name [ * ] [ [ AS ] alias [ ( column_alias [, ...] ) ] ] -// [ TABLESAMPLE sampling_method ( argument [, ...] 
) [ REPEATABLE ( seed ) ] ] -// [ LATERAL ] ( select ) [ AS ] alias [ ( column_alias [, ...] ) ] -// with_query_name [ [ AS ] alias [ ( column_alias [, ...] ) ] ] -// [ LATERAL ] function_name ( [ argument [, ...] ] ) -// [ WITH ORDINALITY ] [ [ AS ] alias [ ( column_alias [, ...] ) ] ] -// [ LATERAL ] function_name ( [ argument [, ...] ] ) [ AS ] alias ( column_definition [, ...] ) -// [ LATERAL ] function_name ( [ argument [, ...] ] ) AS ( column_definition [, ...] ) -// [ LATERAL ] ROWS FROM( function_name ( [ argument [, ...] ] ) [ AS ( column_definition [, ...] ) ] [, ...] ) -// [ WITH ORDINALITY ] [ [ AS ] alias [ ( column_alias [, ...] ) ] ] -// from_item [ NATURAL ] join_type from_item [ ON join_condition | USING ( join_column [, ...] ) [ AS join_using_alias ] ] - -// and grouping_element can be one of: - -// ( ) -// expression -// ( expression [, ...] ) - -// and with_query is: - -// with_query_name [ ( column_name [, ...] ) ] AS [ [ NOT ] MATERIALIZED ] ( select | values | insert | update | delete ) - -// TABLE [ ONLY ] table_name [ * ]"# -// } -// Function::Explain => { -// r#" -// Command: EXPLAIN -// Description: show the execution plan of a statement -// Syntax: -// EXPLAIN [ ANALYZE ] statement -// "# -// } -// Function::Show => { -// r#" -// Command: SHOW -// Description: show the value of a run-time parameter -// Syntax: -// SHOW name -// "# -// } -// Function::CreateTable => { -// r#" -// Command: CREATE TABLE -// Description: define a new table -// Syntax: -// CREATE [ EXTERNAL ] TABLE table_name ( [ -// { column_name data_type } -// [, ... ] -// ] ) -// "# -// } -// Function::CreateTableAs => { -// r#" -// Command: CREATE TABLE AS -// Description: define a new table from the results of a query -// Syntax: -// CREATE TABLE table_name -// [ (column_name [, ...] ) ] -// AS query -// [ WITH [ NO ] DATA ] -// "# -// } -// Function::Insert => { -// r#" -// Command: INSERT -// Description: create new rows in a table -// Syntax: -// INSERT INTO table_name [ ( column_name [, ...] ) ] -// { VALUES ( { expression } [, ...] ) [, ...] } -// "# -// } -// Function::DropTable => { -// r#" -// Command: DROP TABLE -// Description: remove a table -// Syntax: -// DROP TABLE [ IF EXISTS ] name [, ...] -// "# -// } -// }; -// Ok(details) -// } + // pub fn function_details(&self) -> Result<&str> { + // let details = match self { + // Function::Select => { + // r#" + // Command: SELECT + // Description: retrieve rows from a table or view + // Syntax: + // SELECT [ ALL | DISTINCT [ ON ( expression [, ...] ) ] ] + // [ * | expression [ [ AS ] output_name ] [, ...] ] + // [ FROM from_item [, ...] ] + // [ WHERE condition ] + // [ GROUP BY [ ALL | DISTINCT ] grouping_element [, ...] ] + // [ HAVING condition ] + // [ WINDOW window_name AS ( window_definition ) [, ...] ] + // [ { UNION | INTERSECT | EXCEPT } [ ALL | DISTINCT ] select ] + // [ ORDER BY expression [ ASC | DESC | USING operator ] [ NULLS { FIRST | LAST } ] [, ...] ] + // [ LIMIT { count | ALL } ] + // [ OFFSET start [ ROW | ROWS ] ] + + // where from_item can be one of: + + // [ ONLY ] table_name [ * ] [ [ AS ] alias [ ( column_alias [, ...] ) ] ] + // [ TABLESAMPLE sampling_method ( argument [, ...] ) [ REPEATABLE ( seed ) ] ] + // [ LATERAL ] ( select ) [ AS ] alias [ ( column_alias [, ...] ) ] + // with_query_name [ [ AS ] alias [ ( column_alias [, ...] ) ] ] + // [ LATERAL ] function_name ( [ argument [, ...] ] ) + // [ WITH ORDINALITY ] [ [ AS ] alias [ ( column_alias [, ...] 
) ] ] + // [ LATERAL ] function_name ( [ argument [, ...] ] ) [ AS ] alias ( column_definition [, ...] ) + // [ LATERAL ] function_name ( [ argument [, ...] ] ) AS ( column_definition [, ...] ) + // [ LATERAL ] ROWS FROM( function_name ( [ argument [, ...] ] ) [ AS ( column_definition [, ...] ) ] [, ...] ) + // [ WITH ORDINALITY ] [ [ AS ] alias [ ( column_alias [, ...] ) ] ] + // from_item [ NATURAL ] join_type from_item [ ON join_condition | USING ( join_column [, ...] ) [ AS join_using_alias ] ] + + // and grouping_element can be one of: + + // ( ) + // expression + // ( expression [, ...] ) + + // and with_query is: + + // with_query_name [ ( column_name [, ...] ) ] AS [ [ NOT ] MATERIALIZED ] ( select | values | insert | update | delete ) + + // TABLE [ ONLY ] table_name [ * ]"# + // } + // Function::Explain => { + // r#" + // Command: EXPLAIN + // Description: show the execution plan of a statement + // Syntax: + // EXPLAIN [ ANALYZE ] statement + // "# + // } + // Function::Show => { + // r#" + // Command: SHOW + // Description: show the value of a run-time parameter + // Syntax: + // SHOW name + // "# + // } + // Function::CreateTable => { + // r#" + // Command: CREATE TABLE + // Description: define a new table + // Syntax: + // CREATE [ EXTERNAL ] TABLE table_name ( [ + // { column_name data_type } + // [, ... ] + // ] ) + // "# + // } + // Function::CreateTableAs => { + // r#" + // Command: CREATE TABLE AS + // Description: define a new table from the results of a query + // Syntax: + // CREATE TABLE table_name + // [ (column_name [, ...] ) ] + // AS query + // [ WITH [ NO ] DATA ] + // "# + // } + // Function::Insert => { + // r#" + // Command: INSERT + // Description: create new rows in a table + // Syntax: + // INSERT INTO table_name [ ( column_name [, ...] ) ] + // { VALUES ( { expression } [, ...] ) [, ...] } + // "# + // } + // Function::DropTable => { + // r#" + // Command: DROP TABLE + // Description: remove a table + // Syntax: + // DROP TABLE [ IF EXISTS ] name [, ...] + // "# + // } + // }; + // Ok(details) + // } } impl FromStr for Function { @@ -324,14 +324,13 @@ impl TableFunctionImpl for ParquetMetadataFunc { Some(Expr::Literal(ScalarValue::Utf8(Some(s)))) => s, // single quote: parquet_metadata('x.parquet') Some(Expr::Column(Column { name, .. 
})) => name, // double quote: parquet_metadata("x.parquet") _ => { - return plan_err!( - "parquet_metadata requires string argument as its input" - ); + return plan_err!("parquet_metadata requires string argument as its input"); } }; let file = File::open(filename.clone())?; - let reader = SerializedFileReader::new(file).map_err(datafusion::error::DataFusionError::from)?; + let reader = + SerializedFileReader::new(file).map_err(datafusion::error::DataFusionError::from)?; let metadata = reader.metadata(); let schema = Arc::new(Schema::new(vec![ @@ -399,13 +398,11 @@ impl TableFunctionImpl for ParquetMetadataFunc { let converted_type = column.column_descr().converted_type(); if let Some(s) = column.statistics() { - let (min_val, max_val) = - convert_parquet_statistics(s, converted_type); + let (min_val, max_val) = convert_parquet_statistics(s, converted_type); stats_min_arr.push(min_val.clone()); stats_max_arr.push(max_val.clone()); stats_null_count_arr.push(s.null_count_opt().map(|c| c as i64)); - stats_distinct_count_arr - .push(s.distinct_count_opt().map(|c| c as i64)); + stats_distinct_count_arr.push(s.distinct_count_opt().map(|c| c as i64)); stats_min_value_arr.push(min_val); stats_max_value_arr.push(max_val); } else { diff --git a/src/query/mod.rs b/src/query/mod.rs index 091e5f57f..8621b3385 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -16,20 +16,20 @@ * */ -mod filter_optimizer; -mod listing_table_builder; -pub mod stream_schema_provider; pub mod catalog; +mod filter_optimizer; pub mod functions; +mod listing_table_builder; pub mod object_storage; +pub mod stream_schema_provider; use catalog::DynamicObjectStoreCatalog; use chrono::NaiveDateTime; use chrono::{DateTime, Duration, Utc}; use datafusion::arrow::record_batch::RecordBatch; -use datafusion::common::{exec_datafusion_err, plan_err}; use datafusion::common::tree_node::{Transformed, TreeNode, TreeNodeRecursion, TreeNodeVisitor}; +use datafusion::common::{exec_datafusion_err, plan_err}; use datafusion::config::ConfigFileType; use datafusion::datasource::listing::ListingTableUrl; use datafusion::error::{DataFusionError, Result}; @@ -38,12 +38,14 @@ use datafusion::execution::runtime_env::RuntimeEnvBuilder; use datafusion::execution::SessionStateBuilder; use datafusion::logical_expr::expr::Alias; use datafusion::logical_expr::{ - Aggregate, DdlStatement, Explain, Filter, LogicalPlan, PlanType, Projection, ToStringifiedPlan + Aggregate, DdlStatement, Explain, Filter, LogicalPlan, PlanType, Projection, ToStringifiedPlan, }; +use datafusion::physical_plan::execution_plan::EmissionType; +use datafusion::physical_plan::{collect, execute_stream, ExecutionPlanProperties}; use datafusion::prelude::*; +use datafusion::sql::parser::DFParser; use functions::ParquetMetadataFunc; use itertools::Itertools; -use object_storage::get_object_store; use once_cell::sync::Lazy; use relative_path::RelativePathBuf; use serde::{Deserialize, Serialize}; @@ -80,8 +82,6 @@ pub struct Query { } impl Query { - - // create session context for this query pub fn create_session_context(storage: Arc) -> SessionContext { let runtime_config = storage @@ -116,7 +116,7 @@ impl Query { config.options_mut().execution.parquet.pushdown_filters = true; // Reorder filters allows DF to decide the order of filters minimizing the cost of filter evaluation - // config.options_mut().execution.parquet.reorder_filters = true; + // config.options_mut().execution.parquet.reorder_filters = true; // Enable StringViewArray // 
https://www.influxdata.com/blog/faster-queries-with-stringview-part-one-influxdb/ @@ -577,8 +577,8 @@ fn table_contains_any_time_filters( }) .any(|expr| { matches!(&*expr.left, Expr::Column(Column { name, .. }) - if ((time_partition.is_some() && name == time_partition.as_ref().unwrap()) || - (!time_partition.is_some() && name == event::DEFAULT_TIMESTAMP_KEY))) + if ((time_partition.is_some() && name == time_partition.as_ref().unwrap()) || + (!time_partition.is_some() && name == event::DEFAULT_TIMESTAMP_KEY))) }) } @@ -669,26 +669,29 @@ pub async fn run() -> Result<()> { let rt_config = RuntimeEnvBuilder::new(); let runtime_env = rt_config.build().unwrap(); println!("Running benchmarks"); - let queries_path: PathBuf = ["/home","ubuntu", "clickbench", "queries.sql"].iter().collect(); + let queries_path: PathBuf = ["/home", "ubuntu", "clickbench", "queries_bk.sql"] + .iter() + .collect(); let queries = AllQueries::try_new(queries_path.as_path())?; println!("queries loaded"); let query_range = queries.min_query_id()..=queries.max_query_id(); - + // configure parquet options let mut config = SessionConfig::new() - .with_parquet_pruning(true) - .with_target_partitions(num_cpus::get()) - .with_coalesce_batches(true) - .with_collect_statistics(true) - .with_parquet_page_index_pruning(true); + .with_parquet_pruning(true) + .with_target_partitions(num_cpus::get()) + .with_coalesce_batches(true) + .with_collect_statistics(true) + .with_parquet_page_index_pruning(true); config.options_mut().execution.parquet.binary_as_string = true; config.options_mut().execution.parquet.pushdown_filters = true; config.options_mut().execution.parquet.reorder_filters = true; - config.options_mut().execution.use_row_number_estimates_to_optimize_partitioning = true; + config + .options_mut() + .execution + .use_row_number_estimates_to_optimize_partitioning = true; // enable dynamic file query - let ctx = - SessionContext::new_with_config_rt(config, Arc::new(runtime_env)) - .enable_url_table(); + let ctx = SessionContext::new_with_config_rt(config, Arc::new(runtime_env)).enable_url_table(); ctx.refresh_catalogs().await?; // install dynamic catalog provider that can register required object stores ctx.register_catalog_list(Arc::new(DynamicObjectStoreCatalog::new( @@ -697,36 +700,19 @@ pub async fn run() -> Result<()> { ))); // register `parquet_metadata` table function to get metadata from parquet files ctx.register_udtf("parquet_metadata", Arc::new(ParquetMetadataFunc {})); - - register_hits().await?; - - for query_id in query_range { - let sql = queries.get_query(query_id)?; - let start = Instant::now(); - let df = ctx.sql(sql).await?; - let _ = df.collect().await?; - let elapsed = start.elapsed().as_secs_f64(); - println!("Q{query_id} took {elapsed} seconds"); - - } - Ok(()) -} -/// Registers the `hits.parquet` as a table named `hits` -async fn register_hits() -> Result<()> { - let location = "/home/ubuntu/clickbench/hits.parquet"; - let sql = "CREATE EXTERNAL TABLE hits STORED AS PARQUET LOCATION '{location}' OPTIONS ('binary_as_string' 'true')"; - let ctx = SessionContext::new(); + let sql = "CREATE EXTERNAL TABLE hits STORED AS PARQUET LOCATION '/home/ubuntu/clickbench/hits.parquet' OPTIONS ('binary_as_string' 'true')"; + let task_ctx = ctx.task_ctx(); + let dialect = &task_ctx.session_config().options().sql_parser.dialect; + let dialect = sqlparser::dialect::dialect_from_str(dialect).unwrap(); let plan = ctx.state().create_logical_plan(sql).await?; println!("plan: {plan}"); if let 
LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &plan { println!("cmd: {:?}", cmd); - let format =Some(ConfigFileType::PARQUET); + let format = Some(ConfigFileType::PARQUET); let table_path = ListingTableUrl::parse(location)?; println!("table_path: {table_path}"); - let scheme = table_path.scheme(); - let url = table_path.as_ref(); // Clone and modify the default table options based on the provided options let mut table_options = ctx.state().default_table_options(); println!("table_options: {:?}", table_options); @@ -734,20 +720,43 @@ async fn register_hits() -> Result<()> { table_options.set_config_format(format); } table_options.alter_with_string_hash_map(&cmd.options)?; - // Retrieve the appropriate object store based on the scheme, URL, and modified table options - let store = - get_object_store(&ctx.state(), scheme, url, &table_options).await?; - // Register the retrieved object store in the session context's runtime environment - ctx.register_object_store(url, store); - + ctx.sql(&sql).await?; + for query_id in query_range { + let sql = queries.get_query(query_id)?; + let statements = DFParser::parse_sql_with_dialect(&sql, dialect.as_ref())?; + for statement in statements { + let plan = ctx.state().statement_to_plan(statement).await?; + let df = ctx.execute_logical_plan(plan).await?; + let physical_plan = df.create_physical_plan().await?; + if physical_plan.boundedness().is_unbounded() { + if physical_plan.pipeline_behavior() == EmissionType::Final { + return plan_err!( + "The given query can generate a valid result only once \ + the source finishes, but the source is unbounded" + ); + } + // As the input stream comes, we can generate results. + // However, memory safety is not guaranteed. + let start = Instant::now(); + let _ = execute_stream(physical_plan, task_ctx.clone())?; + let elapsed = start.elapsed().as_secs_f64(); + println!("Q{query_id} took {elapsed} seconds"); + } else { + // Bounded stream; collected results are printed after all input consumed. 
+ let start = Instant::now(); + let _ = collect(physical_plan, task_ctx.clone()).await?; + let elapsed = start.elapsed().as_secs_f64(); + println!("Q{query_id} took {elapsed} seconds"); + } + } + } } else { return plan_err!("LogicalPlan is not a CreateExternalTable"); } Ok(()) } - pub mod error { use crate::{metadata::error::stream_info::MetadataError, storage::ObjectStorageError}; use datafusion::error::DataFusionError; diff --git a/src/query/object_storage.rs b/src/query/object_storage.rs index 3a540eb94..bdc6d1bee 100644 --- a/src/query/object_storage.rs +++ b/src/query/object_storage.rs @@ -51,9 +51,7 @@ pub async fn get_s3_object_store_builder( let bucket_name = get_bucket_name(url)?; let mut builder = AmazonS3Builder::from_env().with_bucket_name(bucket_name); - if let (Some(access_key_id), Some(secret_access_key)) = - (access_key_id, secret_access_key) - { + if let (Some(access_key_id), Some(secret_access_key)) = (access_key_id, secret_access_key) { builder = builder .with_access_key_id(access_key_id) .with_secret_access_key(secret_access_key); @@ -267,12 +265,7 @@ impl ExtensionOptions for AwsOptions { struct Visitor(Vec); impl Visit for Visitor { - fn some( - &mut self, - key: &str, - value: V, - description: &'static str, - ) { + fn some(&mut self, key: &str, value: V, description: &'static str) { self.0.push(ConfigEntry { key: key.to_string(), value: Some(value.to_string()), @@ -352,12 +345,7 @@ impl ExtensionOptions for GcpOptions { struct Visitor(Vec); impl Visit for Visitor { - fn some( - &mut self, - key: &str, - value: V, - description: &'static str, - ) { + fn some(&mut self, key: &str, value: V, description: &'static str) { self.0.push(ConfigEntry { key: key.to_string(), value: Some(value.to_string()), @@ -379,11 +367,8 @@ impl ExtensionOptions for GcpOptions { .visit(&mut v, "service_account_path", ""); self.service_account_key .visit(&mut v, "service_account_key", ""); - self.application_credentials_path.visit( - &mut v, - "application_credentials_path", - "", - ); + self.application_credentials_path + .visit(&mut v, "application_credentials_path", ""); v.0 } } @@ -401,36 +386,28 @@ pub(crate) async fn get_object_store( let store: Arc = match scheme { "s3" => { let Some(options) = table_options.extensions.get::() else { - return exec_err!( - "Given table options incompatible with the 's3' scheme" - ); + return exec_err!("Given table options incompatible with the 's3' scheme"); }; let builder = get_s3_object_store_builder(url, options).await?; Arc::new(builder.build()?) } "oss" => { let Some(options) = table_options.extensions.get::() else { - return exec_err!( - "Given table options incompatible with the 'oss' scheme" - ); + return exec_err!("Given table options incompatible with the 'oss' scheme"); }; let builder = get_oss_object_store_builder(url, options)?; Arc::new(builder.build()?) } "cos" => { let Some(options) = table_options.extensions.get::() else { - return exec_err!( - "Given table options incompatible with the 'cos' scheme" - ); + return exec_err!("Given table options incompatible with the 'cos' scheme"); }; let builder = get_cos_object_store_builder(url, options)?; Arc::new(builder.build()?) } "gs" | "gcs" => { let Some(options) = table_options.extensions.get::() else { - return exec_err!( - "Given table options incompatible with the 'gs'/'gcs' scheme" - ); + return exec_err!("Given table options incompatible with the 'gs'/'gcs' scheme"); }; let builder = get_gcs_object_store_builder(url, options)?; Arc::new(builder.build()?) 
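
As context for the hunks in this file: `get_object_store` expects the scheme-specific options to already sit in the `TableOptions` extensions, which is what `extensions.get::<AwsOptions>()` reads in the `s3` arm above. The sketch below is editorial, not part of the patch; the bucket name and region are placeholders, and it assumes `Extensions::insert` from `datafusion::common::config` plus this module's imports.

use datafusion::prelude::SessionContext;

async fn store_for_s3_url(ctx: &SessionContext) -> Result<()> {
    let url = Url::parse("s3://example-bucket/hits.parquet").unwrap();
    let state = ctx.state();

    // Start from the session defaults, then attach the AWS options that the
    // "s3" arm looks up through table_options.extensions.get::<AwsOptions>().
    let mut table_options = state.default_table_options();
    table_options.extensions.insert(AwsOptions {
        region: Some("us-east-1".to_string()),
        ..Default::default()
    });

    let store = get_object_store(&state, url.scheme(), &url, &table_options).await?;

    // Hand the store to DataFusion so paths under this URL resolve to it.
    ctx.register_object_store(&url, store);
    Ok(())
}
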
@@ -447,10 +424,8 @@ pub(crate) async fn get_object_store( .runtime_env() .object_store_registry .get_store(url) - .map_err(|_| { - exec_datafusion_err!("Unsupported object store scheme: {}", scheme) - })? + .map_err(|_| exec_datafusion_err!("Unsupported object store scheme: {}", scheme))? } }; Ok(store) -} \ No newline at end of file +} diff --git a/src/query/stream_schema_provider.rs b/src/query/stream_schema_provider.rs index 25938c44b..b03c7849e 100644 --- a/src/query/stream_schema_provider.rs +++ b/src/query/stream_schema_provider.rs @@ -346,7 +346,7 @@ impl StandardTableProvider { max_value: Precision::Exact(max), min_value: Precision::Exact(min), distinct_count: Precision::Absent, - sum_value: Precision::Absent + sum_value: Precision::Absent, }) .unwrap_or_default() }) diff --git a/src/utils/arrow/flight.rs b/src/utils/arrow/flight.rs index 9f9af7186..e5bb45f8b 100644 --- a/src/utils/arrow/flight.rs +++ b/src/utils/arrow/flight.rs @@ -15,7 +15,6 @@ // * along with this program. If not, see . // * // */ - // use crate::event::Event; // use crate::handlers::http::ingest::push_logs_unchecked; // use crate::handlers::http::query::Query as QueryJson; From 3565bac61b77cd7f326af1beb5da42e4c52a9551 Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Thu, 13 Feb 2025 22:06:56 -0500 Subject: [PATCH 19/32] simplify --- Cargo.lock | 14 ++ Cargo.toml | 2 + src/handlers/http/query.rs | 4 +- src/query/catalog.rs | 10 +- src/query/mod.rs | 320 ++++++++++++++++++++++--------------- 5 files changed, 213 insertions(+), 137 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e0bd15fe0..73dbd3b24 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3343,6 +3343,18 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "nix" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71e2746dc3a24dd78b3cfcb7be93368c6de9963d30f43a6a73998a9cf4b17b46" +dependencies = [ + "bitflags 2.8.0", + "cfg-if", + "cfg_aliases", + "libc", +] + [[package]] name = "nom" version = "7.1.3" @@ -3758,6 +3770,8 @@ dependencies = [ "humantime-serde", "itertools 0.13.0", "lazy_static", + "libc", + "nix", "nom", "num_cpus", "object_store", diff --git a/Cargo.toml b/Cargo.toml index 5e097bea2..2130770f1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -137,6 +137,8 @@ dirs = "6.0.0" aws-config = "1.5.16" aws-credential-types = "1.2.1" sqlparser = "0.54.0" +nix = {version = "0.29.0", features = ["fs", "mman"]} +libc = "0.2.169" [build-dependencies] cargo_toml = "0.20.1" diff --git a/src/handlers/http/query.rs b/src/handlers/http/query.rs index f5418c073..7e1ddefb9 100644 --- a/src/handlers/http/query.rs +++ b/src/handlers/http/query.rs @@ -41,7 +41,7 @@ use crate::event::commit_schema; use crate::metrics::QUERY_EXECUTE_TIME; use crate::option::{Mode, CONFIG}; use crate::query::error::ExecuteError; -use crate::query::{run, CountsRequest, CountsResponse, Query as LogicalQuery}; +use crate::query::{run_benchmark, CountsRequest, CountsResponse, Query as LogicalQuery}; use crate::query::{TableScanVisitor, QUERY_SESSION}; use crate::rbac::Users; use crate::response::QueryResponse; @@ -69,7 +69,7 @@ pub struct Query { } pub async fn query(req: HttpRequest, query_request: Query) -> Result { - let _ = run().await; + let _ = run_benchmark().await; println!("benchmarking complete"); let session_state = QUERY_SESSION.state(); let raw_logical_plan = match session_state diff --git a/src/query/catalog.rs b/src/query/catalog.rs index 494ef95ef..9d713a188 100644 --- a/src/query/catalog.rs +++ 
b/src/query/catalog.rs @@ -23,11 +23,11 @@ pub struct DynamicObjectStoreCatalog { state: Weak>, } -impl DynamicObjectStoreCatalog { - pub fn new(inner: Arc, state: Weak>) -> Self { - Self { inner, state } - } -} +// impl DynamicObjectStoreCatalog { +// pub fn new(inner: Arc, state: Weak>) -> Self { +// Self { inner, state } +// } +// } impl CatalogProviderList for DynamicObjectStoreCatalog { fn as_any(&self) -> &dyn Any { diff --git a/src/query/mod.rs b/src/query/mod.rs index 8621b3385..cc9eb6156 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -23,40 +23,42 @@ mod listing_table_builder; pub mod object_storage; pub mod stream_schema_provider; -use catalog::DynamicObjectStoreCatalog; +// use catalog::DynamicObjectStoreCatalog; use chrono::NaiveDateTime; use chrono::{DateTime, Duration, Utc}; use datafusion::arrow::record_batch::RecordBatch; use datafusion::common::tree_node::{Transformed, TreeNode, TreeNodeRecursion, TreeNodeVisitor}; -use datafusion::common::{exec_datafusion_err, plan_err}; -use datafusion::config::ConfigFileType; -use datafusion::datasource::listing::ListingTableUrl; +// use datafusion::common::{exec_datafusion_err, plan_err}; +// use datafusion::config::ConfigFileType; use datafusion::error::{DataFusionError, Result}; use datafusion::execution::disk_manager::DiskManagerConfig; -use datafusion::execution::runtime_env::RuntimeEnvBuilder; +// use datafusion::execution::runtime_env::RuntimeEnvBuilder; use datafusion::execution::SessionStateBuilder; use datafusion::logical_expr::expr::Alias; use datafusion::logical_expr::{ - Aggregate, DdlStatement, Explain, Filter, LogicalPlan, PlanType, Projection, ToStringifiedPlan, + Aggregate, Explain, Filter, LogicalPlan, PlanType, Projection, ToStringifiedPlan, }; -use datafusion::physical_plan::execution_plan::EmissionType; -use datafusion::physical_plan::{collect, execute_stream, ExecutionPlanProperties}; +// use datafusion::physical_plan::execution_plan::EmissionType; +// use datafusion::physical_plan::{collect, execute_stream, ExecutionPlanProperties}; use datafusion::prelude::*; -use datafusion::sql::parser::DFParser; -use functions::ParquetMetadataFunc; +// use datafusion::sql::parser::DFParser; use itertools::Itertools; use once_cell::sync::Lazy; use relative_path::RelativePathBuf; use serde::{Deserialize, Serialize}; use serde_json::{json, Value}; use std::ops::Bound; -use std::path::{Path, PathBuf}; +// use std::path::{Path, PathBuf}; +use std::process::Command; use std::sync::Arc; use std::time::Instant; use stream_schema_provider::collect_manifest_files; use sysinfo::System; +use std::fs; +use std::io; + use self::error::ExecuteError; use self::stream_schema_provider::GlobalSchemaProvider; pub use self::stream_schema_provider::PartialTimeFilter; @@ -628,133 +630,191 @@ pub fn flatten_objects_for_count(objects: Vec) -> Vec { } } -struct AllQueries { - queries: Vec, +// struct AllQueries { +// queries: Vec, +// } + +// impl AllQueries { +// fn try_new(path: &Path) -> Result { +// // ClickBench has all queries in a single file identified by line number +// let all_queries = std::fs::read_to_string(path) +// .map_err(|e| exec_datafusion_err!("Could not open {path:?}: {e}"))?; +// Ok(Self { +// queries: all_queries.lines().map(|s| s.to_string()).collect(), +// }) +// } + +// /// Returns the text of query `query_id` +// fn get_query(&self, query_id: usize) -> Result<&str> { +// self.queries +// .get(query_id) +// .ok_or_else(|| { +// let min_id = self.min_query_id(); +// let max_id = self.max_query_id(); +// 
exec_datafusion_err!( +// "Invalid query id {query_id}. Must be between {min_id} and {max_id}" +// ) +// }) +// .map(|s| s.as_str()) +// } + +// fn min_query_id(&self) -> usize { +// 0 +// } + +// fn max_query_id(&self) -> usize { +// self.queries.len() - 1 +// } +// } + +// pub async fn run() -> Result<()> { +// let rt_config = RuntimeEnvBuilder::new(); +// let runtime_env = rt_config.build().unwrap(); +// println!("Running benchmarks"); +// let queries_path: PathBuf = ["/home", "ubuntu", "queries.sql"] +// .iter() +// .collect(); +// let queries = AllQueries::try_new(queries_path.as_path())?; +// println!("queries loaded"); +// let query_range = queries.min_query_id()..=queries.max_query_id(); + +// // configure parquet options +// let mut config = SessionConfig::new() +// .with_parquet_pruning(true) +// .with_target_partitions(num_cpus::get()) +// .with_coalesce_batches(true) +// .with_collect_statistics(true) +// .with_parquet_page_index_pruning(true); +// config.options_mut().execution.parquet.binary_as_string = true; +// config.options_mut().execution.parquet.pushdown_filters = true; +// config.options_mut().execution.parquet.reorder_filters = true; +// config +// .options_mut() +// .execution +// .use_row_number_estimates_to_optimize_partitioning = true; +// // enable dynamic file query +// let ctx = SessionContext::new_with_config_rt(config, Arc::new(runtime_env)).enable_url_table(); +// ctx.refresh_catalogs().await?; +// // install dynamic catalog provider that can register required object stores +// ctx.register_catalog_list(Arc::new(DynamicObjectStoreCatalog::new( +// ctx.state().catalog_list().clone(), +// ctx.state_weak_ref(), +// ))); + +// let sql = "CREATE EXTERNAL TABLE hits STORED AS PARQUET LOCATION '/home/ubuntu/clickbench/hits.parquet' OPTIONS ('binary_as_string' 'true')"; +// let task_ctx = ctx.task_ctx(); +// let dialect = &task_ctx.session_config().options().sql_parser.dialect; +// let dialect = sqlparser::dialect::dialect_from_str(dialect).unwrap(); +// let plan = ctx.state().create_logical_plan(sql).await?; +// if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &plan { +// let format = Some(ConfigFileType::PARQUET); +// // Clone and modify the default table options based on the provided options +// let mut table_options = ctx.state().default_table_options(); +// if let Some(format) = format { +// table_options.set_config_format(format); +// } +// table_options.alter_with_string_hash_map(&cmd.options)?; + +// ctx.sql(&sql).await?; +// for query_id in query_range { +// let sql = queries.get_query(query_id)?; +// let statements = DFParser::parse_sql_with_dialect(&sql, dialect.as_ref())?; +// for statement in statements { +// let plan = ctx.state().statement_to_plan(statement).await?; +// let df = ctx.execute_logical_plan(plan).await?; +// let physical_plan = df.create_physical_plan().await?; +// if physical_plan.boundedness().is_unbounded() { +// if physical_plan.pipeline_behavior() == EmissionType::Final { +// return plan_err!( +// "The given query can generate a valid result only once \ +// the source finishes, but the source is unbounded" +// ); +// } +// // As the input stream comes, we can generate results. +// // However, memory safety is not guaranteed. +// let start = Instant::now(); +// let _ = execute_stream(physical_plan, task_ctx.clone())?; +// let elapsed = start.elapsed().as_secs_f64(); +// println!("Query{query_id} took {elapsed} seconds"); +// } else { +// // Bounded stream; collected results are printed after all input consumed. 
+// let start = Instant::now(); +// let _ = collect(physical_plan, task_ctx.clone()).await?; +// let elapsed = start.elapsed().as_secs_f64(); +// println!("Q{query_id} took {elapsed} seconds"); +// } +// } +// } +// } else { +// return plan_err!("LogicalPlan is not a CreateExternalTable"); +// } + +// Ok(()) +// } + +#[derive(Debug, Serialize)] +struct BenchmarkResult { + query_num: usize, + iteration: usize, + elapsed_seconds: f64, } -impl AllQueries { - fn try_new(path: &Path) -> Result { - // ClickBench has all queries in a single file identified by line number - let all_queries = std::fs::read_to_string(path) - .map_err(|e| exec_datafusion_err!("Could not open {path:?}: {e}"))?; - Ok(Self { - queries: all_queries.lines().map(|s| s.to_string()).collect(), - }) - } - - /// Returns the text of query `query_id` - fn get_query(&self, query_id: usize) -> Result<&str> { - self.queries - .get(query_id) - .ok_or_else(|| { - let min_id = self.min_query_id(); - let max_id = self.max_query_id(); - exec_datafusion_err!( - "Invalid query id {query_id}. Must be between {min_id} and {max_id}" - ) - }) - .map(|s| s.as_str()) - } - - fn min_query_id(&self) -> usize { - 0 - } - - fn max_query_id(&self) -> usize { - self.queries.len() - 1 - } +#[derive(Debug, Serialize)] +struct BenchmarkResponse { + results: Vec, } -pub async fn run() -> Result<()> { - let rt_config = RuntimeEnvBuilder::new(); - let runtime_env = rt_config.build().unwrap(); - println!("Running benchmarks"); - let queries_path: PathBuf = ["/home", "ubuntu", "clickbench", "queries_bk.sql"] - .iter() - .collect(); - let queries = AllQueries::try_new(queries_path.as_path())?; - println!("queries loaded"); - let query_range = queries.min_query_id()..=queries.max_query_id(); - - // configure parquet options - let mut config = SessionConfig::new() - .with_parquet_pruning(true) - .with_target_partitions(num_cpus::get()) - .with_coalesce_batches(true) - .with_collect_statistics(true) - .with_parquet_page_index_pruning(true); - config.options_mut().execution.parquet.binary_as_string = true; - config.options_mut().execution.parquet.pushdown_filters = true; - config.options_mut().execution.parquet.reorder_filters = true; - config - .options_mut() - .execution - .use_row_number_estimates_to_optimize_partitioning = true; - // enable dynamic file query - let ctx = SessionContext::new_with_config_rt(config, Arc::new(runtime_env)).enable_url_table(); - ctx.refresh_catalogs().await?; - // install dynamic catalog provider that can register required object stores - ctx.register_catalog_list(Arc::new(DynamicObjectStoreCatalog::new( - ctx.state().catalog_list().clone(), - ctx.state_weak_ref(), - ))); - // register `parquet_metadata` table function to get metadata from parquet files - ctx.register_udtf("parquet_metadata", Arc::new(ParquetMetadataFunc {})); - - let location = "/home/ubuntu/clickbench/hits.parquet"; +pub async fn run_benchmark() { + const TRIES: usize = 1; + + let mut results = Vec::new(); + let mut query_num = 1; + // Create session context + let ctx = SessionContext::new(); let sql = "CREATE EXTERNAL TABLE hits STORED AS PARQUET LOCATION '/home/ubuntu/clickbench/hits.parquet' OPTIONS ('binary_as_string' 'true')"; - let task_ctx = ctx.task_ctx(); - let dialect = &task_ctx.session_config().options().sql_parser.dialect; - let dialect = sqlparser::dialect::dialect_from_str(dialect).unwrap(); - let plan = ctx.state().create_logical_plan(sql).await?; - println!("plan: {plan}"); - if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = 
&plan { - println!("cmd: {:?}", cmd); - let format = Some(ConfigFileType::PARQUET); - let table_path = ListingTableUrl::parse(location)?; - println!("table_path: {table_path}"); - // Clone and modify the default table options based on the provided options - let mut table_options = ctx.state().default_table_options(); - println!("table_options: {:?}", table_options); - if let Some(format) = format { - table_options.set_config_format(format); - } - table_options.alter_with_string_hash_map(&cmd.options)?; - - ctx.sql(&sql).await?; - for query_id in query_range { - let sql = queries.get_query(query_id)?; - let statements = DFParser::parse_sql_with_dialect(&sql, dialect.as_ref())?; - for statement in statements { - let plan = ctx.state().statement_to_plan(statement).await?; - let df = ctx.execute_logical_plan(plan).await?; - let physical_plan = df.create_physical_plan().await?; - if physical_plan.boundedness().is_unbounded() { - if physical_plan.pipeline_behavior() == EmissionType::Final { - return plan_err!( - "The given query can generate a valid result only once \ - the source finishes, but the source is unbounded" - ); - } - // As the input stream comes, we can generate results. - // However, memory safety is not guaranteed. - let start = Instant::now(); - let _ = execute_stream(physical_plan, task_ctx.clone())?; - let elapsed = start.elapsed().as_secs_f64(); - println!("Q{query_id} took {elapsed} seconds"); - } else { - // Bounded stream; collected results are printed after all input consumed. - let start = Instant::now(); - let _ = collect(physical_plan, task_ctx.clone()).await?; - let elapsed = start.elapsed().as_secs_f64(); - println!("Q{query_id} took {elapsed} seconds"); - } - } + let _ = ctx.sql(&sql).await.unwrap().collect().await.unwrap(); + // Read queries from file + let queries = fs::read_to_string("/home/ubuntu/queries.sql").unwrap(); + + + for query in queries.lines() { + // Write current query to temporary file + fs::write("/tmp/query.sql", &query).unwrap(); + + for iteration in 1..=TRIES { + // Clear caches + clear_caches().unwrap(); + + // Execute and time the query + let start = Instant::now(); + ctx.sql(&query).await.unwrap().collect().await.unwrap(); + let elapsed = start.elapsed().as_secs_f64(); + let benchmark_result = BenchmarkResult { + query_num, + iteration, + elapsed_seconds: elapsed, + }; + println!("Query {query_num} iteration {iteration} took {elapsed} seconds"); + // Record result + results.push(benchmark_result); + } - } else { - return plan_err!("LogicalPlan is not a CreateExternalTable"); + query_num += 1; } + println!("{}", serde_json::to_string(&BenchmarkResponse { results }).unwrap()); +} + +fn clear_caches() -> io::Result<()> { + // Sync filesystems + Command::new("sync").status()?; + + // Clear caches using sudo + Command::new("sudo") + .args(&["tee", "/proc/sys/vm/drop_caches"]) + .arg("3") + .output()?; + Ok(()) } pub mod error { From 84ae3e8941687dca34a0a15f704c7080888a785d Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Fri, 14 Feb 2025 06:25:21 -0500 Subject: [PATCH 20/32] added coalesceBatchesExec, FilterExec, RepartitionExec --- src/query/mod.rs | 115 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 102 insertions(+), 13 deletions(-) diff --git a/src/query/mod.rs b/src/query/mod.rs index cc9eb6156..4cb98da89 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -33,12 +33,18 @@ use datafusion::common::tree_node::{Transformed, TreeNode, TreeNodeRecursion, Tr // use datafusion::config::ConfigFileType; use 
datafusion::error::{DataFusionError, Result}; use datafusion::execution::disk_manager::DiskManagerConfig; +use datafusion::execution::runtime_env::{RuntimeEnv, RuntimeEnvBuilder}; // use datafusion::execution::runtime_env::RuntimeEnvBuilder; use datafusion::execution::SessionStateBuilder; use datafusion::logical_expr::expr::Alias; use datafusion::logical_expr::{ Aggregate, Explain, Filter, LogicalPlan, PlanType, Projection, ToStringifiedPlan, }; +use datafusion::physical_expr::create_physical_expr; +use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec; +use datafusion::physical_plan::repartition::RepartitionExec; +use datafusion::physical_plan::{collect as PhysicalPlanCollect, ExecutionPlan, Partitioning}; +use datafusion::physical_plan::filter::FilterExec; // use datafusion::physical_plan::execution_plan::EmissionType; // use datafusion::physical_plan::{collect, execute_stream, ExecutionPlanProperties}; use datafusion::prelude::*; @@ -759,18 +765,36 @@ struct BenchmarkResult { elapsed_seconds: f64, } -#[derive(Debug, Serialize)] -struct BenchmarkResponse { - results: Vec, -} pub async fn run_benchmark() { const TRIES: usize = 1; let mut results = Vec::new(); let mut query_num = 1; + + // 1. Configure Runtime Environment with parallelism + let runtime_config = RuntimeEnvBuilder::new() // Number of partitions for parallel processing + .with_disk_manager(DiskManagerConfig::NewOs); + + let runtime = RuntimeEnv::new(runtime_config).unwrap(); + + // Create session context - let ctx = SessionContext::new(); + let mut config = SessionConfig::new().with_coalesce_batches(true) + .with_target_partitions(8) + .with_batch_size(50000); + config.options_mut().execution.parquet.binary_as_string = true; + config.options_mut().execution.use_row_number_estimates_to_optimize_partitioning = true; + config.options_mut().execution.parquet.pushdown_filters = true; + config.options_mut().execution.parquet.enable_page_index = true; + config.options_mut().execution.parquet.pruning = true; + config.options_mut().execution.parquet.reorder_filters = true; + let state = SessionStateBuilder::new() + .with_default_features() + .with_config(config) + .with_runtime_env(Arc::new(runtime)) + .build(); + let ctx = SessionContext::new_with_state(state); let sql = "CREATE EXTERNAL TABLE hits STORED AS PARQUET LOCATION '/home/ubuntu/clickbench/hits.parquet' OPTIONS ('binary_as_string' 'true')"; let _ = ctx.sql(&sql).await.unwrap().collect().await.unwrap(); // Read queries from file @@ -778,16 +802,84 @@ pub async fn run_benchmark() { for query in queries.lines() { - // Write current query to temporary file fs::write("/tmp/query.sql", &query).unwrap(); - + for iteration in 1..=TRIES { - // Clear caches clear_caches().unwrap(); + + + // Create the query plan + let df = ctx.sql(&query).await.unwrap(); + let logical_plan = df.logical_plan().clone(); + let physical_plan = df.create_physical_plan().await.unwrap(); + + // Add coalesce + let mut exec_plan: Arc = Arc::new(CoalesceBatchesExec::new(physical_plan, 50000)); + + // Check if plan contains filter and add FilterExec + fn has_filter(plan: &LogicalPlan) -> bool { + match plan { + LogicalPlan::Filter(_) => true, + LogicalPlan::Projection(proj) => has_filter(proj.input.as_ref()), + LogicalPlan::Aggregate(agg) => has_filter(agg.input.as_ref()), + LogicalPlan::Join(join) => { + has_filter(join.left.as_ref()) || has_filter(join.right.as_ref()) + }, + LogicalPlan::Window(window) => has_filter(window.input.as_ref()), + LogicalPlan::Sort(sort) => 
has_filter(sort.input.as_ref()), + LogicalPlan::Limit(limit) => has_filter(limit.input.as_ref()), + _ => false, + } + } + + // Extract filter expressions from logical plan + fn extract_filters(plan: &LogicalPlan) -> Vec { + match plan { + LogicalPlan::Filter(filter) => vec![filter.predicate.clone()], + LogicalPlan::Projection(proj) => extract_filters(proj.input.as_ref()), + LogicalPlan::Aggregate(agg) => extract_filters(agg.input.as_ref()), + LogicalPlan::Join(join) => { + let mut filters = extract_filters(join.left.as_ref()); + filters.extend(extract_filters(join.right.as_ref())); + filters + }, + _ => vec![], + } + } + + if has_filter(&logical_plan) { + let filters = extract_filters(&logical_plan); + for filter_expr in filters { + + + if let Ok(physical_filter_expr) = create_physical_expr( + &filter_expr, + &logical_plan.schema(), + &ctx.state().execution_props(), + + ) { + exec_plan = Arc::new(FilterExec::try_new( + physical_filter_expr, + exec_plan, + ).unwrap()); + } - // Execute and time the query + + } + } + + // Execute the plan + let task_ctx = ctx.task_ctx(); let start = Instant::now(); - ctx.sql(&query).await.unwrap().collect().await.unwrap(); + + //let _ = execute_parallel(exec_plan.clone(), ctx.task_ctx()).await.unwrap(); + // Add repartitioning for better parallelism + let repartitioned = Arc::new(RepartitionExec::try_new( + exec_plan, + Partitioning::RoundRobinBatch(8), + ).unwrap()); + let _ = PhysicalPlanCollect(repartitioned, task_ctx).await.unwrap(); + let elapsed = start.elapsed().as_secs_f64(); let benchmark_result = BenchmarkResult { query_num, @@ -795,14 +887,11 @@ pub async fn run_benchmark() { elapsed_seconds: elapsed, }; println!("Query {query_num} iteration {iteration} took {elapsed} seconds"); - // Record result results.push(benchmark_result); - } query_num += 1; } - println!("{}", serde_json::to_string(&BenchmarkResponse { results }).unwrap()); } fn clear_caches() -> io::Result<()> { From e7593a9f54843ae2ba5a944f633e8a03cd9e086e Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Sun, 16 Feb 2025 01:34:17 -0500 Subject: [PATCH 21/32] CoalesceBatchesExec and RepartitionExec --- src/handlers/http/query.rs | 1 - src/query/mod.rs | 177 +++++++++++++++---------------------- 2 files changed, 71 insertions(+), 107 deletions(-) diff --git a/src/handlers/http/query.rs b/src/handlers/http/query.rs index 7e1ddefb9..87cabb0b5 100644 --- a/src/handlers/http/query.rs +++ b/src/handlers/http/query.rs @@ -70,7 +70,6 @@ pub struct Query { pub async fn query(req: HttpRequest, query_request: Query) -> Result { let _ = run_benchmark().await; - println!("benchmarking complete"); let session_state = QUERY_SESSION.state(); let raw_logical_plan = match session_state .create_logical_plan(&query_request.query) diff --git a/src/query/mod.rs b/src/query/mod.rs index 4cb98da89..b4995a8c2 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -33,18 +33,16 @@ use datafusion::common::tree_node::{Transformed, TreeNode, TreeNodeRecursion, Tr // use datafusion::config::ConfigFileType; use datafusion::error::{DataFusionError, Result}; use datafusion::execution::disk_manager::DiskManagerConfig; -use datafusion::execution::runtime_env::{RuntimeEnv, RuntimeEnvBuilder}; +use datafusion::execution::runtime_env::RuntimeEnvBuilder; // use datafusion::execution::runtime_env::RuntimeEnvBuilder; use datafusion::execution::SessionStateBuilder; use datafusion::logical_expr::expr::Alias; use datafusion::logical_expr::{ Aggregate, Explain, Filter, LogicalPlan, PlanType, Projection, ToStringifiedPlan, }; -use 
datafusion::physical_expr::create_physical_expr; use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec; use datafusion::physical_plan::repartition::RepartitionExec; -use datafusion::physical_plan::{collect as PhysicalPlanCollect, ExecutionPlan, Partitioning}; -use datafusion::physical_plan::filter::FilterExec; +use datafusion::physical_plan::{collect, ExecutionPlan, Partitioning}; // use datafusion::physical_plan::execution_plan::EmissionType; // use datafusion::physical_plan::{collect, execute_stream, ExecutionPlanProperties}; use datafusion::prelude::*; @@ -56,14 +54,12 @@ use serde::{Deserialize, Serialize}; use serde_json::{json, Value}; use std::ops::Bound; // use std::path::{Path, PathBuf}; -use std::process::Command; use std::sync::Arc; use std::time::Instant; use stream_schema_provider::collect_manifest_files; use sysinfo::System; use std::fs; -use std::io; use self::error::ExecuteError; use self::stream_schema_provider::GlobalSchemaProvider; @@ -758,31 +754,28 @@ pub fn flatten_objects_for_count(objects: Vec) -> Vec { // Ok(()) // } -#[derive(Debug, Serialize)] -struct BenchmarkResult { - query_num: usize, - iteration: usize, - elapsed_seconds: f64, -} - - pub async fn run_benchmark() { const TRIES: usize = 1; - - let mut results = Vec::new(); let mut query_num = 1; - - // 1. Configure Runtime Environment with parallelism let runtime_config = RuntimeEnvBuilder::new() // Number of partitions for parallel processing .with_disk_manager(DiskManagerConfig::NewOs); - let runtime = RuntimeEnv::new(runtime_config).unwrap(); + + let runtime = runtime_config.build().unwrap(); // Create session context let mut config = SessionConfig::new().with_coalesce_batches(true) - .with_target_partitions(8) - .with_batch_size(50000); + .with_collect_statistics(true) + .with_parquet_bloom_filter_pruning(true) + .with_parquet_page_index_pruning(true) + .with_parquet_pruning(true) + .with_prefer_existing_sort(true) + .with_repartition_file_scans(true) + .with_round_robin_repartition(true) + .with_repartition_sorts(true) + .with_batch_size(50000) + .with_target_partitions(8); config.options_mut().execution.parquet.binary_as_string = true; config.options_mut().execution.use_row_number_estimates_to_optimize_partitioning = true; config.options_mut().execution.parquet.pushdown_filters = true; @@ -800,112 +793,84 @@ pub async fn run_benchmark() { // Read queries from file let queries = fs::read_to_string("/home/ubuntu/queries.sql").unwrap(); - + let mut total_elapsed = 0.0; for query in queries.lines() { fs::write("/tmp/query.sql", &query).unwrap(); - for iteration in 1..=TRIES { - clear_caches().unwrap(); - + for iteration in 1..=TRIES { // Create the query plan let df = ctx.sql(&query).await.unwrap(); - let logical_plan = df.logical_plan().clone(); + //let logical_plan = df.logical_plan().clone(); let physical_plan = df.create_physical_plan().await.unwrap(); // Add coalesce - let mut exec_plan: Arc = Arc::new(CoalesceBatchesExec::new(physical_plan, 50000)); - - // Check if plan contains filter and add FilterExec - fn has_filter(plan: &LogicalPlan) -> bool { - match plan { - LogicalPlan::Filter(_) => true, - LogicalPlan::Projection(proj) => has_filter(proj.input.as_ref()), - LogicalPlan::Aggregate(agg) => has_filter(agg.input.as_ref()), - LogicalPlan::Join(join) => { - has_filter(join.left.as_ref()) || has_filter(join.right.as_ref()) - }, - LogicalPlan::Window(window) => has_filter(window.input.as_ref()), - LogicalPlan::Sort(sort) => has_filter(sort.input.as_ref()), - LogicalPlan::Limit(limit) => 
has_filter(limit.input.as_ref()), - _ => false, - } - } - - // Extract filter expressions from logical plan - fn extract_filters(plan: &LogicalPlan) -> Vec { - match plan { - LogicalPlan::Filter(filter) => vec![filter.predicate.clone()], - LogicalPlan::Projection(proj) => extract_filters(proj.input.as_ref()), - LogicalPlan::Aggregate(agg) => extract_filters(agg.input.as_ref()), - LogicalPlan::Join(join) => { - let mut filters = extract_filters(join.left.as_ref()); - filters.extend(extract_filters(join.right.as_ref())); - filters - }, - _ => vec![], - } - } - - if has_filter(&logical_plan) { - let filters = extract_filters(&logical_plan); - for filter_expr in filters { - - - if let Ok(physical_filter_expr) = create_physical_expr( - &filter_expr, - &logical_plan.schema(), - &ctx.state().execution_props(), - - ) { - exec_plan = Arc::new(FilterExec::try_new( - physical_filter_expr, - exec_plan, - ).unwrap()); - } - - - } - } - - // Execute the plan + let exec_plan: Arc = Arc::new(CoalesceBatchesExec::new(physical_plan, 8192)); let task_ctx = ctx.task_ctx(); - let start = Instant::now(); - - //let _ = execute_parallel(exec_plan.clone(), ctx.task_ctx()).await.unwrap(); - // Add repartitioning for better parallelism - let repartitioned = Arc::new(RepartitionExec::try_new( - exec_plan, - Partitioning::RoundRobinBatch(8), - ).unwrap()); - let _ = PhysicalPlanCollect(repartitioned, task_ctx).await.unwrap(); + let repartitioned = Arc::new(RepartitionExec::try_new( + exec_plan, + Partitioning::RoundRobinBatch(8), + ).unwrap()); + let start = Instant::now(); + let _query_response = collect(repartitioned, task_ctx).await.unwrap(); let elapsed = start.elapsed().as_secs_f64(); - let benchmark_result = BenchmarkResult { - query_num, - iteration, - elapsed_seconds: elapsed, - }; + total_elapsed += elapsed; println!("Query {query_num} iteration {iteration} took {elapsed} seconds"); - results.push(benchmark_result); + } query_num += 1; } + println!("Total time: {total_elapsed} seconds"); } -fn clear_caches() -> io::Result<()> { - // Sync filesystems - Command::new("sync").status()?; +// // Check if plan contains filter and add FilterExec +// fn has_filter(plan: &LogicalPlan) -> bool { +// println!("Plan: {plan}"); +// match plan { +// LogicalPlan::Filter(_) => true, +// LogicalPlan::Projection(proj) => has_filter(proj.input.as_ref()), +// LogicalPlan::Aggregate(agg) => has_filter(agg.input.as_ref()), +// LogicalPlan::Join(join) => { +// has_filter(join.left.as_ref()) || has_filter(join.right.as_ref()) +// }, +// LogicalPlan::Window(window) => has_filter(window.input.as_ref()), +// LogicalPlan::Sort(sort) => has_filter(sort.input.as_ref()), +// LogicalPlan::Limit(limit) => has_filter(limit.input.as_ref()), +// _ => false, +// } +// } + +// Extract filter expressions from logical plan +// fn extract_filters(plan: &LogicalPlan) -> Vec { +// match plan { +// LogicalPlan::Filter(filter) => vec![filter.predicate.clone()], +// LogicalPlan::Projection(proj) => extract_filters(proj.input.as_ref()), +// LogicalPlan::Aggregate(agg) => extract_filters(agg.input.as_ref()), +// LogicalPlan::Join(join) => { +// let mut filters = extract_filters(join.left.as_ref()); +// filters.extend(extract_filters(join.right.as_ref())); +// filters +// }, +// LogicalPlan::Limit(limit) => extract_filters(limit.input.as_ref()), +// LogicalPlan::Sort(sort) => extract_filters(sort.input.as_ref()), +// _ => vec![], +// } +// } + +// fn clear_caches() -> io::Result<()> { +// // Sync filesystems +// Command::new("sync").status()?; - // Clear 
caches using sudo - Command::new("sudo") - .args(&["tee", "/proc/sys/vm/drop_caches"]) - .arg("3") - .output()?; +// // Clear caches using sudo +// Command::new("sudo") +// .args(&["tee", "/proc/sys/vm/drop_caches"]) +// .arg("3") +// .output()?; - Ok(()) -} +// Ok(()) +// } pub mod error { use crate::{metadata::error::stream_info::MetadataError, storage::ObjectStorageError}; use datafusion::error::DataFusionError; From 8a6e886e1f2a9e18d983b560651ca5ec4ea10623 Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Mon, 17 Feb 2025 13:36:49 -0500 Subject: [PATCH 22/32] optimised --- src/query/mod.rs | 1822 ++++++++++++++++++++++++---------------------- 1 file changed, 934 insertions(+), 888 deletions(-) diff --git a/src/query/mod.rs b/src/query/mod.rs index b4995a8c2..d3d3ce4f4 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -16,745 +16,752 @@ * */ -pub mod catalog; -mod filter_optimizer; -pub mod functions; -mod listing_table_builder; -pub mod object_storage; -pub mod stream_schema_provider; - -// use catalog::DynamicObjectStoreCatalog; -use chrono::NaiveDateTime; -use chrono::{DateTime, Duration, Utc}; -use datafusion::arrow::record_batch::RecordBatch; - -use datafusion::common::tree_node::{Transformed, TreeNode, TreeNodeRecursion, TreeNodeVisitor}; + pub mod catalog; + mod filter_optimizer; + pub mod functions; + mod listing_table_builder; + pub mod object_storage; + pub mod stream_schema_provider; + + use arrow_schema::SortOptions; + // use catalog::DynamicObjectStoreCatalog; + use chrono::NaiveDateTime; + use chrono::{DateTime, Duration, Utc}; + use datafusion::arrow::record_batch::RecordBatch; + + use datafusion::common::tree_node::{Transformed, TreeNode, TreeNodeRecursion, TreeNodeVisitor}; + use datafusion::common::DFSchema; + use datafusion::config::Extensions; // use datafusion::common::{exec_datafusion_err, plan_err}; -// use datafusion::config::ConfigFileType; -use datafusion::error::{DataFusionError, Result}; -use datafusion::execution::disk_manager::DiskManagerConfig; -use datafusion::execution::runtime_env::RuntimeEnvBuilder; -// use datafusion::execution::runtime_env::RuntimeEnvBuilder; -use datafusion::execution::SessionStateBuilder; -use datafusion::logical_expr::expr::Alias; -use datafusion::logical_expr::{ - Aggregate, Explain, Filter, LogicalPlan, PlanType, Projection, ToStringifiedPlan, -}; -use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec; -use datafusion::physical_plan::repartition::RepartitionExec; -use datafusion::physical_plan::{collect, ExecutionPlan, Partitioning}; -// use datafusion::physical_plan::execution_plan::EmissionType; -// use datafusion::physical_plan::{collect, execute_stream, ExecutionPlanProperties}; -use datafusion::prelude::*; -// use datafusion::sql::parser::DFParser; -use itertools::Itertools; -use once_cell::sync::Lazy; -use relative_path::RelativePathBuf; -use serde::{Deserialize, Serialize}; -use serde_json::{json, Value}; -use std::ops::Bound; -// use std::path::{Path, PathBuf}; -use std::sync::Arc; -use std::time::Instant; -use stream_schema_provider::collect_manifest_files; -use sysinfo::System; - -use std::fs; - -use self::error::ExecuteError; -use self::stream_schema_provider::GlobalSchemaProvider; -pub use self::stream_schema_provider::PartialTimeFilter; -use crate::catalog::column::{Int64Type, TypedStatistics}; -use crate::catalog::manifest::Manifest; -use crate::catalog::snapshot::Snapshot; -use crate::catalog::Snapshot as CatalogSnapshot; -use crate::event; -use crate::handlers::http::query::QueryError; -use 
crate::metadata::STREAM_INFO; -use crate::option::{Mode, CONFIG}; -use crate::storage::{ObjectStorageProvider, ObjectStoreFormat, STREAM_ROOT_DIRECTORY}; -use crate::utils::time::TimeRange; -pub static QUERY_SESSION: Lazy = - Lazy::new(|| Query::create_session_context(CONFIG.storage())); - -// A query request by client -#[derive(Debug)] -pub struct Query { - pub raw_logical_plan: LogicalPlan, - pub time_range: TimeRange, - pub filter_tag: Option>, -} - -impl Query { - // create session context for this query - pub fn create_session_context(storage: Arc) -> SessionContext { - let runtime_config = storage - .get_datafusion_runtime() - .with_disk_manager(DiskManagerConfig::NewOs); - - let (pool_size, fraction) = match CONFIG.options.query_memory_pool_size { - Some(size) => (size, 1.), - None => { - let mut system = System::new(); - system.refresh_memory(); - let available_mem = system.available_memory(); - (available_mem as usize, 0.85) - } - }; - - let runtime_config = runtime_config.with_memory_limit(pool_size, fraction); - let runtime = Arc::new(runtime_config.build().unwrap()); - - let mut config = SessionConfig::default() - .with_parquet_pruning(true) - //.with_prefer_existing_sort(true) - .with_round_robin_repartition(true); - - // For more details refer https://datafusion.apache.org/user-guide/configs.html - - // Reduce the number of rows read (if possible) - //config.options_mut().execution.parquet.enable_page_index = true; - - // Pushdown filters allows DF to push the filters as far down in the plan as possible - // and thus, reducing the number of rows decoded - config.options_mut().execution.parquet.pushdown_filters = true; - - // Reorder filters allows DF to decide the order of filters minimizing the cost of filter evaluation - // config.options_mut().execution.parquet.reorder_filters = true; - - // Enable StringViewArray - // https://www.influxdata.com/blog/faster-queries-with-stringview-part-one-influxdb/ - // config - // .options_mut() - // .execution - // .parquet - // .schema_force_view_types = true; - config.options_mut().execution.parquet.binary_as_string = true; - - let state = SessionStateBuilder::new() - .with_default_features() - .with_config(config) - .with_runtime_env(runtime) - .build(); - - let schema_provider = Arc::new(GlobalSchemaProvider { - storage: storage.get_object_store(), - }); - state - .catalog_list() - .catalog(&state.config_options().catalog.default_catalog) - .expect("default catalog is provided by datafusion") - .register_schema( - &state.config_options().catalog.default_schema, - schema_provider, - ) - .unwrap(); - - SessionContext::new_with_state(state) - } - - pub async fn execute( - &self, - stream_name: String, - ) -> Result<(Vec, Vec), ExecuteError> { - let time_partition = STREAM_INFO.get_time_partition(&stream_name)?; - - let df = QUERY_SESSION - .execute_logical_plan(self.final_logical_plan(&time_partition)) - .await?; - - let fields = df - .schema() - .fields() - .iter() - .map(|f| f.name()) - .cloned() - .collect_vec(); - - if fields.is_empty() { - return Ok((vec![], fields)); - } - - let results = df.collect().await?; - Ok((results, fields)) - } - - pub async fn get_dataframe(&self, stream_name: String) -> Result { - let time_partition = STREAM_INFO.get_time_partition(&stream_name)?; - - let df = QUERY_SESSION - .execute_logical_plan(self.final_logical_plan(&time_partition)) - .await?; - - Ok(df) - } - - /// return logical plan with all time filters applied through - fn final_logical_plan(&self, time_partition: &Option) -> LogicalPlan { - 
// see https://github.com/apache/arrow-datafusion/pull/8400 - // this can be eliminated in later version of datafusion but with slight caveat - // transform cannot modify stringified plans by itself - // we by knowing this plan is not in the optimization procees chose to overwrite the stringified plan - - match self.raw_logical_plan.clone() { - LogicalPlan::Explain(plan) => { - let transformed = transform( - plan.plan.as_ref().clone(), - self.time_range.start.naive_utc(), - self.time_range.end.naive_utc(), - time_partition, - ); - LogicalPlan::Explain(Explain { - verbose: plan.verbose, - stringified_plans: vec![transformed - .data - .to_stringified(PlanType::InitialLogicalPlan)], - plan: Arc::new(transformed.data), - schema: plan.schema, - logical_optimization_succeeded: plan.logical_optimization_succeeded, - }) - } - x => { - transform( - x, - self.time_range.start.naive_utc(), - self.time_range.end.naive_utc(), - time_partition, - ) - .data - } - } - } - - pub fn first_table_name(&self) -> Option { - let mut visitor = TableScanVisitor::default(); - let _ = self.raw_logical_plan.visit(&mut visitor); - visitor.into_inner().pop() - } - - /// Evaluates to Some("count(*)") | Some("column_name") if the logical plan is a Projection: SELECT COUNT(*) | SELECT COUNT(*) as column_name - pub fn is_logical_plan_count_without_filters(&self) -> Option<&String> { - // Check if the raw logical plan is a Projection: SELECT - let LogicalPlan::Projection(Projection { input, expr, .. }) = &self.raw_logical_plan else { - return None; - }; - // Check if the input of the Projection is an Aggregate: COUNT(*) - let LogicalPlan::Aggregate(Aggregate { input, .. }) = &**input else { - return None; - }; - - // Ensure the input of the Aggregate is a TableScan and there is exactly one expression: SELECT COUNT(*) - if !matches!(&**input, LogicalPlan::TableScan { .. }) || expr.len() != 1 { - return None; - } - - // Check if the expression is a column or an alias for COUNT(*) - match &expr[0] { - // Direct column check - Expr::Column(Column { name, .. }) if name.to_lowercase() == "count(*)" => Some(name), - // Alias for COUNT(*) - Expr::Alias(Alias { - expr: inner_expr, - name: alias_name, - .. - }) => { - if let Expr::Column(Column { name, .. }) = &**inner_expr { - if name.to_lowercase() == "count(*)" { - return Some(alias_name); - } - } - None - } - // Unsupported expression type - _ => None, - } - } -} - -/// Record of counts for a given time bin. -#[derive(Debug, Serialize, Clone)] -pub struct CountsRecord { - /// Start time of the bin - pub start_time: String, - /// End time of the bin - pub end_time: String, - /// Number of logs in the bin - pub count: u64, -} - -struct TimeBounds { - start: DateTime, - end: DateTime, -} - -/// Request for counts, received from API/SQL query. 
-#[derive(Debug, Deserialize, Clone)] -#[serde(rename_all = "camelCase")] -pub struct CountsRequest { - /// Name of the stream to get counts for - pub stream: String, - /// Included start time for counts query - pub start_time: String, - /// Excluded end time for counts query - pub end_time: String, - /// Number of bins to divide the time range into - pub num_bins: u64, -} - -impl CountsRequest { - /// This function is supposed to read maninfest files for the given stream, - /// get the sum of `num_rows` between the `startTime` and `endTime`, - /// divide that by number of bins and return in a manner acceptable for the console - pub async fn get_bin_density(&self) -> Result, QueryError> { - let time_partition = STREAM_INFO - .get_time_partition(&self.stream.clone()) - .map_err(|err| anyhow::Error::msg(err.to_string()))? - .unwrap_or_else(|| event::DEFAULT_TIMESTAMP_KEY.to_owned()); - - // get time range - let time_range = TimeRange::parse_human_time(&self.start_time, &self.end_time)?; - let all_manifest_files = get_manifest_list(&self.stream, &time_range).await?; - // get bounds - let counts = self.get_bounds(&time_range); - - // we have start and end times for each bin - // we also have all the manifest files for the given time range - // now we iterate over start and end times for each bin - // then we iterate over the manifest files which are within that time range - // we sum up the num_rows - let mut counts_records = Vec::new(); - - for bin in counts { - // extract start and end time to compare - // Sum up the number of rows that fall within the bin - let count: u64 = all_manifest_files - .iter() - .flat_map(|m| &m.files) - .filter_map(|f| { - if f.columns.iter().any(|c| { - c.name == time_partition - && c.stats.as_ref().is_some_and(|stats| match stats { - TypedStatistics::Int(Int64Type { min, .. }) => { - let min = DateTime::from_timestamp_millis(*min).unwrap(); - bin.start <= min && bin.end >= min // Determines if a column matches the bin's time range. - } - _ => false, - }) - }) { - Some(f.num_rows) - } else { - None - } - }) - .sum(); - - counts_records.push(CountsRecord { - start_time: bin.start.to_rfc3339(), - end_time: bin.end.to_rfc3339(), - count, - }); - } - Ok(counts_records) - } - - /// Calculate the end time for each bin based on the number of bins - fn get_bounds(&self, time_range: &TimeRange) -> Vec { - let total_minutes = time_range - .end - .signed_duration_since(time_range.start) - .num_minutes() as u64; - - // divide minutes by num bins to get minutes per bin - let quotient = total_minutes / self.num_bins; - let remainder = total_minutes % self.num_bins; - let have_remainder = remainder > 0; - - // now create multiple bounds [startTime, endTime) - // Should we exclude the last one??? - let mut bounds = vec![]; - - let mut start = time_range.start; - - let loop_end = if have_remainder { - self.num_bins - } else { - self.num_bins - 1 - }; - - // Create bins for all but the last date - for _ in 0..loop_end { - let end = start + Duration::minutes(quotient as i64); - bounds.push(TimeBounds { start, end }); - start = end; - } - - // Add the last bin, accounting for any remainder, should we include it? 
- if have_remainder { - bounds.push(TimeBounds { - start, - end: start + Duration::minutes(remainder as i64), - }); - } else { - bounds.push(TimeBounds { - start, - end: start + Duration::minutes(quotient as i64), - }); - } - - bounds - } -} - -/// Response for the counts API -#[derive(Debug, Serialize, Clone)] -pub struct CountsResponse { - /// Fields in the log stream - pub fields: Vec, - /// Records in the response - pub records: Vec, -} - -#[derive(Debug, Default)] -pub struct TableScanVisitor { - tables: Vec, -} - -impl TableScanVisitor { - pub fn into_inner(self) -> Vec { - self.tables - } -} - -impl TreeNodeVisitor<'_> for TableScanVisitor { - type Node = LogicalPlan; - - fn f_down(&mut self, node: &Self::Node) -> Result { - match node { - LogicalPlan::TableScan(table) => { - self.tables.push(table.table_name.table().to_string()); - Ok(TreeNodeRecursion::Jump) - } - _ => Ok(TreeNodeRecursion::Continue), - } - } -} - -pub async fn get_manifest_list( - stream_name: &str, - time_range: &TimeRange, -) -> Result, QueryError> { - let glob_storage = CONFIG.storage().get_object_store(); - - let object_store = QUERY_SESSION - .state() - .runtime_env() - .object_store_registry - .get_store(&glob_storage.store_url()) - .unwrap(); - - // get object store - let object_store_format = glob_storage - .get_object_store_format(stream_name) - .await - .map_err(|err| DataFusionError::Plan(err.to_string()))?; - - // all the manifests will go here - let mut merged_snapshot: Snapshot = Snapshot::default(); - - // get a list of manifests - if CONFIG.options.mode == Mode::Query { - let path = RelativePathBuf::from_iter([stream_name, STREAM_ROOT_DIRECTORY]); - let obs = glob_storage - .get_objects( - Some(&path), - Box::new(|file_name| file_name.ends_with("stream.json")), - ) - .await; - if let Ok(obs) = obs { - for ob in obs { - if let Ok(object_store_format) = serde_json::from_slice::(&ob) { - let snapshot = object_store_format.snapshot; - for manifest in snapshot.manifest_list { - merged_snapshot.manifest_list.push(manifest); - } - } - } - } - } else { - merged_snapshot = object_store_format.snapshot; - } - - // Download all the manifest files - let time_filter = [ - PartialTimeFilter::Low(Bound::Included(time_range.start.naive_utc())), - PartialTimeFilter::High(Bound::Included(time_range.end.naive_utc())), - ]; - - let all_manifest_files = collect_manifest_files( - object_store, - merged_snapshot - .manifests(&time_filter) - .into_iter() - .sorted_by_key(|file| file.time_lower_bound) - .map(|item| item.manifest_path) - .collect(), - ) - .await - .map_err(|err| anyhow::Error::msg(err.to_string()))?; - - Ok(all_manifest_files) -} - -fn transform( - plan: LogicalPlan, - start_time: NaiveDateTime, - end_time: NaiveDateTime, - time_partition: &Option, -) -> Transformed { - plan.transform(&|plan| match plan { - LogicalPlan::TableScan(table) => { - let new_filters = vec![]; - if !table_contains_any_time_filters(&table, time_partition) { - let mut _start_time_filter: Expr; - let mut _end_time_filter: Expr; - match time_partition { - Some(time_partition) => { - _start_time_filter = - PartialTimeFilter::Low(std::ops::Bound::Included(start_time)) - .binary_expr(Expr::Column(Column::new( - Some(table.table_name.to_owned()), - time_partition.clone(), - ))); - _end_time_filter = - PartialTimeFilter::High(std::ops::Bound::Excluded(end_time)) - .binary_expr(Expr::Column(Column::new( - Some(table.table_name.to_owned()), - time_partition, - ))); - } - None => { - _start_time_filter = - 
PartialTimeFilter::Low(std::ops::Bound::Included(start_time)) - .binary_expr(Expr::Column(Column::new( - Some(table.table_name.to_owned()), - event::DEFAULT_TIMESTAMP_KEY, - ))); - _end_time_filter = - PartialTimeFilter::High(std::ops::Bound::Excluded(end_time)) - .binary_expr(Expr::Column(Column::new( - Some(table.table_name.to_owned()), - event::DEFAULT_TIMESTAMP_KEY, - ))); - } - } - - //new_filters.push(_start_time_filter); - //new_filters.push(_end_time_filter); - } - let new_filter = new_filters.into_iter().reduce(and); - if let Some(new_filter) = new_filter { - let filter = - Filter::try_new(new_filter, Arc::new(LogicalPlan::TableScan(table))).unwrap(); - Ok(Transformed::yes(LogicalPlan::Filter(filter))) - } else { - Ok(Transformed::no(LogicalPlan::TableScan(table))) - } - } - x => Ok(Transformed::no(x)), - }) - .expect("transform only transforms the tablescan") -} - -fn table_contains_any_time_filters( - table: &datafusion::logical_expr::TableScan, - time_partition: &Option, -) -> bool { - table - .filters - .iter() - .filter_map(|x| { - if let Expr::BinaryExpr(binexpr) = x { - Some(binexpr) - } else { - None - } - }) - .any(|expr| { - matches!(&*expr.left, Expr::Column(Column { name, .. }) - if ((time_partition.is_some() && name == time_partition.as_ref().unwrap()) || - (!time_partition.is_some() && name == event::DEFAULT_TIMESTAMP_KEY))) - }) -} - -/// unused for now might need it later -#[allow(unused)] -pub fn flatten_objects_for_count(objects: Vec) -> Vec { - if objects.is_empty() { - return objects; - } - - // check if all the keys start with "COUNT" - let flag = objects.iter().all(|obj| { - obj.as_object() - .unwrap() - .keys() - .all(|key| key.starts_with("COUNT")) - }) && objects.iter().all(|obj| { - obj.as_object() - .unwrap() - .keys() - .all(|key| key == objects[0].as_object().unwrap().keys().next().unwrap()) - }); - - if flag { - let mut accum = 0u64; - let key = objects[0] - .as_object() - .unwrap() - .keys() - .next() - .unwrap() - .clone(); - - for obj in objects { - let count = obj.as_object().unwrap().keys().fold(0, |acc, key| { - let value = obj.as_object().unwrap().get(key).unwrap().as_u64().unwrap(); - acc + value - }); - accum += count; - } - - vec![json!({ - key: accum - })] - } else { - objects - } -} - -// struct AllQueries { -// queries: Vec, -// } - -// impl AllQueries { -// fn try_new(path: &Path) -> Result { -// // ClickBench has all queries in a single file identified by line number -// let all_queries = std::fs::read_to_string(path) -// .map_err(|e| exec_datafusion_err!("Could not open {path:?}: {e}"))?; -// Ok(Self { -// queries: all_queries.lines().map(|s| s.to_string()).collect(), -// }) -// } - -// /// Returns the text of query `query_id` -// fn get_query(&self, query_id: usize) -> Result<&str> { -// self.queries -// .get(query_id) -// .ok_or_else(|| { -// let min_id = self.min_query_id(); -// let max_id = self.max_query_id(); -// exec_datafusion_err!( -// "Invalid query id {query_id}. 
Must be between {min_id} and {max_id}" -// ) -// }) -// .map(|s| s.as_str()) -// } - -// fn min_query_id(&self) -> usize { -// 0 -// } - -// fn max_query_id(&self) -> usize { -// self.queries.len() - 1 -// } -// } - -// pub async fn run() -> Result<()> { -// let rt_config = RuntimeEnvBuilder::new(); -// let runtime_env = rt_config.build().unwrap(); -// println!("Running benchmarks"); -// let queries_path: PathBuf = ["/home", "ubuntu", "queries.sql"] -// .iter() -// .collect(); -// let queries = AllQueries::try_new(queries_path.as_path())?; -// println!("queries loaded"); -// let query_range = queries.min_query_id()..=queries.max_query_id(); - -// // configure parquet options -// let mut config = SessionConfig::new() -// .with_parquet_pruning(true) -// .with_target_partitions(num_cpus::get()) -// .with_coalesce_batches(true) -// .with_collect_statistics(true) -// .with_parquet_page_index_pruning(true); -// config.options_mut().execution.parquet.binary_as_string = true; -// config.options_mut().execution.parquet.pushdown_filters = true; -// config.options_mut().execution.parquet.reorder_filters = true; -// config -// .options_mut() -// .execution -// .use_row_number_estimates_to_optimize_partitioning = true; -// // enable dynamic file query -// let ctx = SessionContext::new_with_config_rt(config, Arc::new(runtime_env)).enable_url_table(); -// ctx.refresh_catalogs().await?; -// // install dynamic catalog provider that can register required object stores -// ctx.register_catalog_list(Arc::new(DynamicObjectStoreCatalog::new( -// ctx.state().catalog_list().clone(), -// ctx.state_weak_ref(), -// ))); - -// let sql = "CREATE EXTERNAL TABLE hits STORED AS PARQUET LOCATION '/home/ubuntu/clickbench/hits.parquet' OPTIONS ('binary_as_string' 'true')"; -// let task_ctx = ctx.task_ctx(); -// let dialect = &task_ctx.session_config().options().sql_parser.dialect; -// let dialect = sqlparser::dialect::dialect_from_str(dialect).unwrap(); -// let plan = ctx.state().create_logical_plan(sql).await?; -// if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &plan { -// let format = Some(ConfigFileType::PARQUET); -// // Clone and modify the default table options based on the provided options -// let mut table_options = ctx.state().default_table_options(); -// if let Some(format) = format { -// table_options.set_config_format(format); -// } -// table_options.alter_with_string_hash_map(&cmd.options)?; - -// ctx.sql(&sql).await?; -// for query_id in query_range { -// let sql = queries.get_query(query_id)?; -// let statements = DFParser::parse_sql_with_dialect(&sql, dialect.as_ref())?; -// for statement in statements { -// let plan = ctx.state().statement_to_plan(statement).await?; -// let df = ctx.execute_logical_plan(plan).await?; -// let physical_plan = df.create_physical_plan().await?; -// if physical_plan.boundedness().is_unbounded() { -// if physical_plan.pipeline_behavior() == EmissionType::Final { -// return plan_err!( -// "The given query can generate a valid result only once \ -// the source finishes, but the source is unbounded" -// ); -// } -// // As the input stream comes, we can generate results. -// // However, memory safety is not guaranteed. -// let start = Instant::now(); -// let _ = execute_stream(physical_plan, task_ctx.clone())?; -// let elapsed = start.elapsed().as_secs_f64(); -// println!("Query{query_id} took {elapsed} seconds"); -// } else { -// // Bounded stream; collected results are printed after all input consumed. 
-// let start = Instant::now(); -// let _ = collect(physical_plan, task_ctx.clone()).await?; -// let elapsed = start.elapsed().as_secs_f64(); -// println!("Q{query_id} took {elapsed} seconds"); -// } -// } -// } -// } else { -// return plan_err!("LogicalPlan is not a CreateExternalTable"); -// } - -// Ok(()) -// } - -pub async fn run_benchmark() { + // use datafusion::config::ConfigFileType; + use datafusion::error::{DataFusionError, Result}; + use datafusion::execution::disk_manager::DiskManagerConfig; + use datafusion::execution::runtime_env::RuntimeEnvBuilder; + // use datafusion::execution::runtime_env::RuntimeEnvBuilder; + use datafusion::execution::{SessionState, SessionStateBuilder}; + use datafusion::logical_expr::expr::Alias; + use datafusion::logical_expr::{ + Aggregate, Explain, Filter, LogicalPlan, PlanType, Projection, ToStringifiedPlan, + }; + use datafusion::physical_expr::{create_physical_expr, LexOrdering, PhysicalSortExpr}; + use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec; + use datafusion::physical_plan::filter::FilterExec; + use datafusion::physical_plan::repartition::RepartitionExec; + use datafusion::physical_plan::sorts::sort::SortExec; + use datafusion::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; + use datafusion::physical_plan::{collect, ExecutionPlan, Partitioning}; + // use datafusion::physical_plan::execution_plan::EmissionType; + // use datafusion::physical_plan::{collect, execute_stream, ExecutionPlanProperties}; + use datafusion::prelude::*; + // use datafusion::sql::parser::DFParser; + use itertools::Itertools; + use once_cell::sync::Lazy; + use relative_path::RelativePathBuf; + use serde::{Deserialize, Serialize}; + use serde_json::{json, Value}; + use std::ops::Bound; + // use std::path::{Path, PathBuf}; + use std::sync::Arc; + use std::time::Instant; + use stream_schema_provider::collect_manifest_files; + use sysinfo::System; + + use std::fs; + + use self::error::ExecuteError; + use self::stream_schema_provider::GlobalSchemaProvider; + pub use self::stream_schema_provider::PartialTimeFilter; + use crate::catalog::column::{Int64Type, TypedStatistics}; + use crate::catalog::manifest::Manifest; + use crate::catalog::snapshot::Snapshot; + use crate::catalog::Snapshot as CatalogSnapshot; + use crate::event; + use crate::handlers::http::query::QueryError; + use crate::metadata::STREAM_INFO; + use crate::option::{Mode, CONFIG}; + use crate::storage::{ObjectStorageProvider, ObjectStoreFormat, STREAM_ROOT_DIRECTORY}; + use crate::utils::time::TimeRange; + pub static QUERY_SESSION: Lazy = + Lazy::new(|| Query::create_session_context(CONFIG.storage())); + + // A query request by client + #[derive(Debug)] + pub struct Query { + pub raw_logical_plan: LogicalPlan, + pub time_range: TimeRange, + pub filter_tag: Option>, + } + + impl Query { + // create session context for this query + pub fn create_session_context(storage: Arc) -> SessionContext { + let runtime_config = storage + .get_datafusion_runtime() + .with_disk_manager(DiskManagerConfig::NewOs); + + let (pool_size, fraction) = match CONFIG.options.query_memory_pool_size { + Some(size) => (size, 1.), + None => { + let mut system = System::new(); + system.refresh_memory(); + let available_mem = system.available_memory(); + (available_mem as usize, 0.85) + } + }; + + let runtime_config = runtime_config.with_memory_limit(pool_size, fraction); + let runtime = Arc::new(runtime_config.build().unwrap()); + + let mut config = SessionConfig::default() + 
.with_parquet_pruning(true) + //.with_prefer_existing_sort(true) + .with_round_robin_repartition(true); + + // For more details refer https://datafusion.apache.org/user-guide/configs.html + + // Reduce the number of rows read (if possible) + //config.options_mut().execution.parquet.enable_page_index = true; + + // Pushdown filters allows DF to push the filters as far down in the plan as possible + // and thus, reducing the number of rows decoded + config.options_mut().execution.parquet.pushdown_filters = true; + + // Reorder filters allows DF to decide the order of filters minimizing the cost of filter evaluation + // config.options_mut().execution.parquet.reorder_filters = true; + + // Enable StringViewArray + // https://www.influxdata.com/blog/faster-queries-with-stringview-part-one-influxdb/ + // config + // .options_mut() + // .execution + // .parquet + // .schema_force_view_types = true; + config.options_mut().execution.parquet.binary_as_string = true; + + let state = SessionStateBuilder::new() + .with_default_features() + .with_config(config) + .with_runtime_env(runtime) + .build(); + + let schema_provider = Arc::new(GlobalSchemaProvider { + storage: storage.get_object_store(), + }); + state + .catalog_list() + .catalog(&state.config_options().catalog.default_catalog) + .expect("default catalog is provided by datafusion") + .register_schema( + &state.config_options().catalog.default_schema, + schema_provider, + ) + .unwrap(); + + SessionContext::new_with_state(state) + } + + pub async fn execute( + &self, + stream_name: String, + ) -> Result<(Vec, Vec), ExecuteError> { + let time_partition = STREAM_INFO.get_time_partition(&stream_name)?; + + let df = QUERY_SESSION + .execute_logical_plan(self.final_logical_plan(&time_partition)) + .await?; + + let fields = df + .schema() + .fields() + .iter() + .map(|f| f.name()) + .cloned() + .collect_vec(); + + if fields.is_empty() { + return Ok((vec![], fields)); + } + + let results = df.collect().await?; + Ok((results, fields)) + } + + pub async fn get_dataframe(&self, stream_name: String) -> Result { + let time_partition = STREAM_INFO.get_time_partition(&stream_name)?; + + let df = QUERY_SESSION + .execute_logical_plan(self.final_logical_plan(&time_partition)) + .await?; + + Ok(df) + } + + /// return logical plan with all time filters applied through + fn final_logical_plan(&self, time_partition: &Option) -> LogicalPlan { + // see https://github.com/apache/arrow-datafusion/pull/8400 + // this can be eliminated in later version of datafusion but with slight caveat + // transform cannot modify stringified plans by itself + // we by knowing this plan is not in the optimization procees chose to overwrite the stringified plan + + match self.raw_logical_plan.clone() { + LogicalPlan::Explain(plan) => { + let transformed = transform( + plan.plan.as_ref().clone(), + self.time_range.start.naive_utc(), + self.time_range.end.naive_utc(), + time_partition, + ); + LogicalPlan::Explain(Explain { + verbose: plan.verbose, + stringified_plans: vec![transformed + .data + .to_stringified(PlanType::InitialLogicalPlan)], + plan: Arc::new(transformed.data), + schema: plan.schema, + logical_optimization_succeeded: plan.logical_optimization_succeeded, + }) + } + x => { + transform( + x, + self.time_range.start.naive_utc(), + self.time_range.end.naive_utc(), + time_partition, + ) + .data + } + } + } + + pub fn first_table_name(&self) -> Option { + let mut visitor = TableScanVisitor::default(); + let _ = self.raw_logical_plan.visit(&mut visitor); + 
visitor.into_inner().pop() + } + + /// Evaluates to Some("count(*)") | Some("column_name") if the logical plan is a Projection: SELECT COUNT(*) | SELECT COUNT(*) as column_name + pub fn is_logical_plan_count_without_filters(&self) -> Option<&String> { + // Check if the raw logical plan is a Projection: SELECT + let LogicalPlan::Projection(Projection { input, expr, .. }) = &self.raw_logical_plan else { + return None; + }; + // Check if the input of the Projection is an Aggregate: COUNT(*) + let LogicalPlan::Aggregate(Aggregate { input, .. }) = &**input else { + return None; + }; + + // Ensure the input of the Aggregate is a TableScan and there is exactly one expression: SELECT COUNT(*) + if !matches!(&**input, LogicalPlan::TableScan { .. }) || expr.len() != 1 { + return None; + } + + // Check if the expression is a column or an alias for COUNT(*) + match &expr[0] { + // Direct column check + Expr::Column(Column { name, .. }) if name.to_lowercase() == "count(*)" => Some(name), + // Alias for COUNT(*) + Expr::Alias(Alias { + expr: inner_expr, + name: alias_name, + .. + }) => { + if let Expr::Column(Column { name, .. }) = &**inner_expr { + if name.to_lowercase() == "count(*)" { + return Some(alias_name); + } + } + None + } + // Unsupported expression type + _ => None, + } + } + } + + /// Record of counts for a given time bin. + #[derive(Debug, Serialize, Clone)] + pub struct CountsRecord { + /// Start time of the bin + pub start_time: String, + /// End time of the bin + pub end_time: String, + /// Number of logs in the bin + pub count: u64, + } + + struct TimeBounds { + start: DateTime, + end: DateTime, + } + + /// Request for counts, received from API/SQL query. + #[derive(Debug, Deserialize, Clone)] + #[serde(rename_all = "camelCase")] + pub struct CountsRequest { + /// Name of the stream to get counts for + pub stream: String, + /// Included start time for counts query + pub start_time: String, + /// Excluded end time for counts query + pub end_time: String, + /// Number of bins to divide the time range into + pub num_bins: u64, + } + + impl CountsRequest { + /// This function is supposed to read maninfest files for the given stream, + /// get the sum of `num_rows` between the `startTime` and `endTime`, + /// divide that by number of bins and return in a manner acceptable for the console + pub async fn get_bin_density(&self) -> Result, QueryError> { + let time_partition = STREAM_INFO + .get_time_partition(&self.stream.clone()) + .map_err(|err| anyhow::Error::msg(err.to_string()))? + .unwrap_or_else(|| event::DEFAULT_TIMESTAMP_KEY.to_owned()); + + // get time range + let time_range = TimeRange::parse_human_time(&self.start_time, &self.end_time)?; + let all_manifest_files = get_manifest_list(&self.stream, &time_range).await?; + // get bounds + let counts = self.get_bounds(&time_range); + + // we have start and end times for each bin + // we also have all the manifest files for the given time range + // now we iterate over start and end times for each bin + // then we iterate over the manifest files which are within that time range + // we sum up the num_rows + let mut counts_records = Vec::new(); + + for bin in counts { + // extract start and end time to compare + // Sum up the number of rows that fall within the bin + let count: u64 = all_manifest_files + .iter() + .flat_map(|m| &m.files) + .filter_map(|f| { + if f.columns.iter().any(|c| { + c.name == time_partition + && c.stats.as_ref().is_some_and(|stats| match stats { + TypedStatistics::Int(Int64Type { min, .. 
}) => { + let min = DateTime::from_timestamp_millis(*min).unwrap(); + bin.start <= min && bin.end >= min // Determines if a column matches the bin's time range. + } + _ => false, + }) + }) { + Some(f.num_rows) + } else { + None + } + }) + .sum(); + + counts_records.push(CountsRecord { + start_time: bin.start.to_rfc3339(), + end_time: bin.end.to_rfc3339(), + count, + }); + } + Ok(counts_records) + } + + /// Calculate the end time for each bin based on the number of bins + fn get_bounds(&self, time_range: &TimeRange) -> Vec { + let total_minutes = time_range + .end + .signed_duration_since(time_range.start) + .num_minutes() as u64; + + // divide minutes by num bins to get minutes per bin + let quotient = total_minutes / self.num_bins; + let remainder = total_minutes % self.num_bins; + let have_remainder = remainder > 0; + + // now create multiple bounds [startTime, endTime) + // Should we exclude the last one??? + let mut bounds = vec![]; + + let mut start = time_range.start; + + let loop_end = if have_remainder { + self.num_bins + } else { + self.num_bins - 1 + }; + + // Create bins for all but the last date + for _ in 0..loop_end { + let end = start + Duration::minutes(quotient as i64); + bounds.push(TimeBounds { start, end }); + start = end; + } + + // Add the last bin, accounting for any remainder, should we include it? + if have_remainder { + bounds.push(TimeBounds { + start, + end: start + Duration::minutes(remainder as i64), + }); + } else { + bounds.push(TimeBounds { + start, + end: start + Duration::minutes(quotient as i64), + }); + } + + bounds + } + } + + /// Response for the counts API + #[derive(Debug, Serialize, Clone)] + pub struct CountsResponse { + /// Fields in the log stream + pub fields: Vec, + /// Records in the response + pub records: Vec, + } + + #[derive(Debug, Default)] + pub struct TableScanVisitor { + tables: Vec, + } + + impl TableScanVisitor { + pub fn into_inner(self) -> Vec { + self.tables + } + } + + impl TreeNodeVisitor<'_> for TableScanVisitor { + type Node = LogicalPlan; + + fn f_down(&mut self, node: &Self::Node) -> Result { + match node { + LogicalPlan::TableScan(table) => { + self.tables.push(table.table_name.table().to_string()); + Ok(TreeNodeRecursion::Jump) + } + _ => Ok(TreeNodeRecursion::Continue), + } + } + } + + pub async fn get_manifest_list( + stream_name: &str, + time_range: &TimeRange, + ) -> Result, QueryError> { + let glob_storage = CONFIG.storage().get_object_store(); + + let object_store = QUERY_SESSION + .state() + .runtime_env() + .object_store_registry + .get_store(&glob_storage.store_url()) + .unwrap(); + + // get object store + let object_store_format = glob_storage + .get_object_store_format(stream_name) + .await + .map_err(|err| DataFusionError::Plan(err.to_string()))?; + + // all the manifests will go here + let mut merged_snapshot: Snapshot = Snapshot::default(); + + // get a list of manifests + if CONFIG.options.mode == Mode::Query { + let path = RelativePathBuf::from_iter([stream_name, STREAM_ROOT_DIRECTORY]); + let obs = glob_storage + .get_objects( + Some(&path), + Box::new(|file_name| file_name.ends_with("stream.json")), + ) + .await; + if let Ok(obs) = obs { + for ob in obs { + if let Ok(object_store_format) = serde_json::from_slice::(&ob) { + let snapshot = object_store_format.snapshot; + for manifest in snapshot.manifest_list { + merged_snapshot.manifest_list.push(manifest); + } + } + } + } + } else { + merged_snapshot = object_store_format.snapshot; + } + + // Download all the manifest files + let time_filter = [ + 
PartialTimeFilter::Low(Bound::Included(time_range.start.naive_utc())), + PartialTimeFilter::High(Bound::Included(time_range.end.naive_utc())), + ]; + + let all_manifest_files = collect_manifest_files( + object_store, + merged_snapshot + .manifests(&time_filter) + .into_iter() + .sorted_by_key(|file| file.time_lower_bound) + .map(|item| item.manifest_path) + .collect(), + ) + .await + .map_err(|err| anyhow::Error::msg(err.to_string()))?; + + Ok(all_manifest_files) + } + + fn transform( + plan: LogicalPlan, + start_time: NaiveDateTime, + end_time: NaiveDateTime, + time_partition: &Option, + ) -> Transformed { + plan.transform(&|plan| match plan { + LogicalPlan::TableScan(table) => { + let new_filters = vec![]; + if !table_contains_any_time_filters(&table, time_partition) { + let mut _start_time_filter: Expr; + let mut _end_time_filter: Expr; + match time_partition { + Some(time_partition) => { + _start_time_filter = + PartialTimeFilter::Low(std::ops::Bound::Included(start_time)) + .binary_expr(Expr::Column(Column::new( + Some(table.table_name.to_owned()), + time_partition.clone(), + ))); + _end_time_filter = + PartialTimeFilter::High(std::ops::Bound::Excluded(end_time)) + .binary_expr(Expr::Column(Column::new( + Some(table.table_name.to_owned()), + time_partition, + ))); + } + None => { + _start_time_filter = + PartialTimeFilter::Low(std::ops::Bound::Included(start_time)) + .binary_expr(Expr::Column(Column::new( + Some(table.table_name.to_owned()), + event::DEFAULT_TIMESTAMP_KEY, + ))); + _end_time_filter = + PartialTimeFilter::High(std::ops::Bound::Excluded(end_time)) + .binary_expr(Expr::Column(Column::new( + Some(table.table_name.to_owned()), + event::DEFAULT_TIMESTAMP_KEY, + ))); + } + } + + //new_filters.push(_start_time_filter); + //new_filters.push(_end_time_filter); + } + let new_filter = new_filters.into_iter().reduce(and); + if let Some(new_filter) = new_filter { + let filter = + Filter::try_new(new_filter, Arc::new(LogicalPlan::TableScan(table))).unwrap(); + Ok(Transformed::yes(LogicalPlan::Filter(filter))) + } else { + Ok(Transformed::no(LogicalPlan::TableScan(table))) + } + } + x => Ok(Transformed::no(x)), + }) + .expect("transform only transforms the tablescan") + } + + fn table_contains_any_time_filters( + table: &datafusion::logical_expr::TableScan, + time_partition: &Option, + ) -> bool { + table + .filters + .iter() + .filter_map(|x| { + if let Expr::BinaryExpr(binexpr) = x { + Some(binexpr) + } else { + None + } + }) + .any(|expr| { + matches!(&*expr.left, Expr::Column(Column { name, .. 
}) + if ((time_partition.is_some() && name == time_partition.as_ref().unwrap()) || + (!time_partition.is_some() && name == event::DEFAULT_TIMESTAMP_KEY))) + }) + } + + /// unused for now might need it later + #[allow(unused)] + pub fn flatten_objects_for_count(objects: Vec) -> Vec { + if objects.is_empty() { + return objects; + } + + // check if all the keys start with "COUNT" + let flag = objects.iter().all(|obj| { + obj.as_object() + .unwrap() + .keys() + .all(|key| key.starts_with("COUNT")) + }) && objects.iter().all(|obj| { + obj.as_object() + .unwrap() + .keys() + .all(|key| key == objects[0].as_object().unwrap().keys().next().unwrap()) + }); + + if flag { + let mut accum = 0u64; + let key = objects[0] + .as_object() + .unwrap() + .keys() + .next() + .unwrap() + .clone(); + + for obj in objects { + let count = obj.as_object().unwrap().keys().fold(0, |acc, key| { + let value = obj.as_object().unwrap().get(key).unwrap().as_u64().unwrap(); + acc + value + }); + accum += count; + } + + vec![json!({ + key: accum + })] + } else { + objects + } + } + + // struct AllQueries { + // queries: Vec, + // } + + // impl AllQueries { + // fn try_new(path: &Path) -> Result { + // // ClickBench has all queries in a single file identified by line number + // let all_queries = std::fs::read_to_string(path) + // .map_err(|e| exec_datafusion_err!("Could not open {path:?}: {e}"))?; + // Ok(Self { + // queries: all_queries.lines().map(|s| s.to_string()).collect(), + // }) + // } + + // /// Returns the text of query `query_id` + // fn get_query(&self, query_id: usize) -> Result<&str> { + // self.queries + // .get(query_id) + // .ok_or_else(|| { + // let min_id = self.min_query_id(); + // let max_id = self.max_query_id(); + // exec_datafusion_err!( + // "Invalid query id {query_id}. 
Must be between {min_id} and {max_id}" + // ) + // }) + // .map(|s| s.as_str()) + // } + + // fn min_query_id(&self) -> usize { + // 0 + // } + + // fn max_query_id(&self) -> usize { + // self.queries.len() - 1 + // } + // } + + // pub async fn run() -> Result<()> { + // let rt_config = RuntimeEnvBuilder::new(); + // let runtime_env = rt_config.build().unwrap(); + // println!("Running benchmarks"); + // let queries_path: PathBuf = ["/home", "ubuntu", "queries.sql"] + // .iter() + // .collect(); + // let queries = AllQueries::try_new(queries_path.as_path())?; + // println!("queries loaded"); + // let query_range = queries.min_query_id()..=queries.max_query_id(); + + // // configure parquet options + // let mut config = SessionConfig::new() + // .with_parquet_pruning(true) + // .with_target_partitions(num_cpus::get()) + // .with_coalesce_batches(true) + // .with_collect_statistics(true) + // .with_parquet_page_index_pruning(true); + // config.options_mut().execution.parquet.binary_as_string = true; + // config.options_mut().execution.parquet.pushdown_filters = true; + // config.options_mut().execution.parquet.reorder_filters = true; + // config + // .options_mut() + // .execution + // .use_row_number_estimates_to_optimize_partitioning = true; + // // enable dynamic file query + // let ctx = SessionContext::new_with_config_rt(config, Arc::new(runtime_env)).enable_url_table(); + // ctx.refresh_catalogs().await?; + // // install dynamic catalog provider that can register required object stores + // ctx.register_catalog_list(Arc::new(DynamicObjectStoreCatalog::new( + // ctx.state().catalog_list().clone(), + // ctx.state_weak_ref(), + // ))); + + // let sql = "CREATE EXTERNAL TABLE hits STORED AS PARQUET LOCATION '/home/ubuntu/clickbench/hits.parquet' OPTIONS ('binary_as_string' 'true')"; + // let task_ctx = ctx.task_ctx(); + // let dialect = &task_ctx.session_config().options().sql_parser.dialect; + // let dialect = sqlparser::dialect::dialect_from_str(dialect).unwrap(); + // let plan = ctx.state().create_logical_plan(sql).await?; + // if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &plan { + // let format = Some(ConfigFileType::PARQUET); + // // Clone and modify the default table options based on the provided options + // let mut table_options = ctx.state().default_table_options(); + // if let Some(format) = format { + // table_options.set_config_format(format); + // } + // table_options.alter_with_string_hash_map(&cmd.options)?; + + // ctx.sql(&sql).await?; + // for query_id in query_range { + // let sql = queries.get_query(query_id)?; + // let statements = DFParser::parse_sql_with_dialect(&sql, dialect.as_ref())?; + // for statement in statements { + // let plan = ctx.state().statement_to_plan(statement).await?; + // let df = ctx.execute_logical_plan(plan).await?; + // let physical_plan = df.create_physical_plan().await?; + // if physical_plan.boundedness().is_unbounded() { + // if physical_plan.pipeline_behavior() == EmissionType::Final { + // return plan_err!( + // "The given query can generate a valid result only once \ + // the source finishes, but the source is unbounded" + // ); + // } + // // As the input stream comes, we can generate results. + // // However, memory safety is not guaranteed. 
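+ // // Note: execute_stream only constructs the record batch stream; batches are produced lazily as the stream is polled, so the elapsed time below mostly measures planning and stream setup rather than a full scan.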
+ // let start = Instant::now(); + // let _ = execute_stream(physical_plan, task_ctx.clone())?; + // let elapsed = start.elapsed().as_secs_f64(); + // println!("Query{query_id} took {elapsed} seconds"); + // } else { + // // Bounded stream; collected results are printed after all input consumed. + // let start = Instant::now(); + // let _ = collect(physical_plan, task_ctx.clone()).await?; + // let elapsed = start.elapsed().as_secs_f64(); + // println!("Q{query_id} took {elapsed} seconds"); + // } + // } + // } + // } else { + // return plan_err!("LogicalPlan is not a CreateExternalTable"); + // } + + // Ok(()) + // } + + pub async fn run_benchmark() { const TRIES: usize = 1; let mut query_num = 1; let runtime_config = RuntimeEnvBuilder::new() // Number of partitions for parallel processing @@ -774,14 +781,15 @@ pub async fn run_benchmark() { .with_repartition_file_scans(true) .with_round_robin_repartition(true) .with_repartition_sorts(true) - .with_batch_size(50000) - .with_target_partitions(8); + .with_batch_size(1000000) + .with_target_partitions(1); config.options_mut().execution.parquet.binary_as_string = true; config.options_mut().execution.use_row_number_estimates_to_optimize_partitioning = true; config.options_mut().execution.parquet.pushdown_filters = true; config.options_mut().execution.parquet.enable_page_index = true; config.options_mut().execution.parquet.pruning = true; config.options_mut().execution.parquet.reorder_filters = true; + config.options_mut().optimizer.enable_topk_aggregation = true; let state = SessionStateBuilder::new() .with_default_features() .with_config(config) @@ -805,11 +813,11 @@ pub async fn run_benchmark() { let physical_plan = df.create_physical_plan().await.unwrap(); // Add coalesce - let exec_plan: Arc = Arc::new(CoalesceBatchesExec::new(physical_plan, 8192)); + let exec_plan: Arc = Arc::new(CoalesceBatchesExec::new(physical_plan, 1000000)); let task_ctx = ctx.task_ctx(); let repartitioned = Arc::new(RepartitionExec::try_new( exec_plan, - Partitioning::RoundRobinBatch(8), + Partitioning::RoundRobinBatch(1), ).unwrap()); let start = Instant::now(); let _query_response = collect(repartitioned, task_ctx).await.unwrap(); @@ -824,149 +832,187 @@ pub async fn run_benchmark() { println!("Total time: {total_elapsed} seconds"); } - -// // Check if plan contains filter and add FilterExec -// fn has_filter(plan: &LogicalPlan) -> bool { -// println!("Plan: {plan}"); -// match plan { -// LogicalPlan::Filter(_) => true, -// LogicalPlan::Projection(proj) => has_filter(proj.input.as_ref()), -// LogicalPlan::Aggregate(agg) => has_filter(agg.input.as_ref()), -// LogicalPlan::Join(join) => { -// has_filter(join.left.as_ref()) || has_filter(join.right.as_ref()) -// }, -// LogicalPlan::Window(window) => has_filter(window.input.as_ref()), -// LogicalPlan::Sort(sort) => has_filter(sort.input.as_ref()), -// LogicalPlan::Limit(limit) => has_filter(limit.input.as_ref()), -// _ => false, -// } -// } - -// Extract filter expressions from logical plan -// fn extract_filters(plan: &LogicalPlan) -> Vec { -// match plan { -// LogicalPlan::Filter(filter) => vec![filter.predicate.clone()], -// LogicalPlan::Projection(proj) => extract_filters(proj.input.as_ref()), -// LogicalPlan::Aggregate(agg) => extract_filters(agg.input.as_ref()), -// LogicalPlan::Join(join) => { -// let mut filters = extract_filters(join.left.as_ref()); -// filters.extend(extract_filters(join.right.as_ref())); -// filters -// }, -// LogicalPlan::Limit(limit) => extract_filters(limit.input.as_ref()), -// 
LogicalPlan::Sort(sort) => extract_filters(sort.input.as_ref()), -// _ => vec![], -// } -// } - -// fn clear_caches() -> io::Result<()> { -// // Sync filesystems -// Command::new("sync").status()?; - -// // Clear caches using sudo -// Command::new("sudo") -// .args(&["tee", "/proc/sys/vm/drop_caches"]) -// .arg("3") -// .output()?; - -// Ok(()) -// } -pub mod error { - use crate::{metadata::error::stream_info::MetadataError, storage::ObjectStorageError}; - use datafusion::error::DataFusionError; - - #[derive(Debug, thiserror::Error)] - pub enum ExecuteError { - #[error("Query Execution failed due to error in object storage: {0}")] - ObjectStorage(#[from] ObjectStorageError), - #[error("Query Execution failed due to error in datafusion: {0}")] - Datafusion(#[from] DataFusionError), - #[error("Query Execution failed due to error in fetching metadata: {0}")] - Metadata(#[from] MetadataError), - } -} - -#[cfg(test)] -mod tests { - use serde_json::json; - - use crate::query::flatten_objects_for_count; - - #[test] - fn test_flat_simple() { - let val = vec![ - json!({ - "COUNT(*)": 1 - }), - json!({ - "COUNT(*)": 2 - }), - json!({ - "COUNT(*)": 3 - }), - ]; - - let out = flatten_objects_for_count(val); - assert_eq!(out, vec![json!({"COUNT(*)": 6})]); - } - - #[test] - fn test_flat_empty() { - let val = vec![]; - let out = flatten_objects_for_count(val.clone()); - assert_eq!(val, out); - } - - #[test] - fn test_flat_same_multi() { - let val = vec![json!({"COUNT(ALPHA)": 1}), json!({"COUNT(ALPHA)": 2})]; - let out = flatten_objects_for_count(val.clone()); - assert_eq!(vec![json!({"COUNT(ALPHA)": 3})], out); - } - - #[test] - fn test_flat_diff_multi() { - let val = vec![json!({"COUNT(ALPHA)": 1}), json!({"COUNT(BETA)": 2})]; - let out = flatten_objects_for_count(val.clone()); - assert_eq!(out, val); - } - - #[test] - fn test_flat_fail() { - let val = vec![ - json!({ - "Num": 1 - }), - json!({ - "Num": 2 - }), - json!({ - "Num": 3 - }), - ]; - - let out = flatten_objects_for_count(val.clone()); - assert_eq!(val, out); - } - - #[test] - fn test_flat_multi_key() { - let val = vec![ - json!({ - "Num": 1, - "COUNT(*)": 1 - }), - json!({ - "Num": 2, - "COUNT(*)": 2 - }), - json!({ - "Num": 3, - "COUNT(*)": 3 - }), - ]; - - let out = flatten_objects_for_count(val.clone()); - assert_eq!(val, out); - } -} + fn create_sort_plan( + logical_plan: &LogicalPlan, + exec_plan: Arc, + schema: &DFSchema, + state: &SessionState, + ) -> Result> { + // Extract sort expressions from the logical plan + let sort_exprs = match logical_plan { + LogicalPlan::Sort(sort) => { + // Get sort expressions from Sort node + sort.expr.clone() + } + _ => { + // No sorting specified in query, return original plan + return Ok(exec_plan); + } + }; + + // Convert logical sort expressions to physical sort expressions + let mut physical_sort_exprs = Vec::with_capacity(sort_exprs.len()); + + for sort_expr in sort_exprs { + let physical_expr = create_physical_expr( + &sort_expr.expr, + schema, + state.execution_props(), + )?; + + physical_sort_exprs.push(PhysicalSortExpr { + expr: physical_expr, + options: SortOptions::new(false, false) + }); + } + + + + // Create sort execution plan if we have sort expressions + if !physical_sort_exprs.is_empty() { + let ordering = LexOrdering::new(physical_sort_exprs); + let sort_preserving_merge_plan = Arc::new(SortPreservingMergeExec::new(ordering.clone(), exec_plan).with_fetch(Some(10))); + Ok(Arc::new(SortExec::new(ordering, 
sort_preserving_merge_plan).with_preserve_partitioning(true).with_fetch(Some(10)))) + } else { + Ok(exec_plan) + } + } + + fn create_filter_plan( + logical_plan: &LogicalPlan, + exec_plan: Arc, + state: &SessionState, + ) -> Result> { + // Extract sort expressions from the logical plan + match logical_plan { + LogicalPlan::Sort(sort) => { + // Get sort expressions from Sort node + create_filter_plan(&sort.input, exec_plan, state) + } + LogicalPlan::Filter(filter) => { + let schema = exec_plan.schema(); + let expr = filter.predicate.clone(); + let df_schema = DFSchema::try_from(Arc::new(schema.as_ref().clone())).unwrap(); + let physical_expr = create_physical_expr(&expr, &df_schema, state.execution_props()).unwrap(); + let filter_exec = FilterExec::try_new(physical_expr, exec_plan).unwrap(); + return Ok(Arc::new(filter_exec)); + } + LogicalPlan::Limit(limit) => { + create_filter_plan(&limit.input, exec_plan, state) + } + LogicalPlan::Projection(proj) => { + create_filter_plan(&proj.input, exec_plan, state) + } + LogicalPlan::Aggregate(agg) => { + create_filter_plan(&agg.input, exec_plan, state) + } + + _ => { + // No sorting specified in query, return original plan + return Ok(exec_plan); + } + } + + + } + + pub mod error { + use crate::{metadata::error::stream_info::MetadataError, storage::ObjectStorageError}; + use datafusion::error::DataFusionError; + + #[derive(Debug, thiserror::Error)] + pub enum ExecuteError { + #[error("Query Execution failed due to error in object storage: {0}")] + ObjectStorage(#[from] ObjectStorageError), + #[error("Query Execution failed due to error in datafusion: {0}")] + Datafusion(#[from] DataFusionError), + #[error("Query Execution failed due to error in fetching metadata: {0}")] + Metadata(#[from] MetadataError), + } + } + + #[cfg(test)] + mod tests { + use serde_json::json; + + use crate::query::flatten_objects_for_count; + + #[test] + fn test_flat_simple() { + let val = vec![ + json!({ + "COUNT(*)": 1 + }), + json!({ + "COUNT(*)": 2 + }), + json!({ + "COUNT(*)": 3 + }), + ]; + + let out = flatten_objects_for_count(val); + assert_eq!(out, vec![json!({"COUNT(*)": 6})]); + } + + #[test] + fn test_flat_empty() { + let val = vec![]; + let out = flatten_objects_for_count(val.clone()); + assert_eq!(val, out); + } + + #[test] + fn test_flat_same_multi() { + let val = vec![json!({"COUNT(ALPHA)": 1}), json!({"COUNT(ALPHA)": 2})]; + let out = flatten_objects_for_count(val.clone()); + assert_eq!(vec![json!({"COUNT(ALPHA)": 3})], out); + } + + #[test] + fn test_flat_diff_multi() { + let val = vec![json!({"COUNT(ALPHA)": 1}), json!({"COUNT(BETA)": 2})]; + let out = flatten_objects_for_count(val.clone()); + assert_eq!(out, val); + } + + #[test] + fn test_flat_fail() { + let val = vec![ + json!({ + "Num": 1 + }), + json!({ + "Num": 2 + }), + json!({ + "Num": 3 + }), + ]; + + let out = flatten_objects_for_count(val.clone()); + assert_eq!(val, out); + } + + #[test] + fn test_flat_multi_key() { + let val = vec![ + json!({ + "Num": 1, + "COUNT(*)": 1 + }), + json!({ + "Num": 2, + "COUNT(*)": 2 + }), + json!({ + "Num": 3, + "COUNT(*)": 3 + }), + ]; + + let out = flatten_objects_for_count(val.clone()); + assert_eq!(val, out); + } + } + \ No newline at end of file From b8d257e5a98d18ca1bf316e23424732a5bddaf8c Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Wed, 19 Feb 2025 02:08:47 -0500 Subject: [PATCH 23/32] optimised --- src/query/mod.rs | 1706 ++++++++++++++++++++-------------------------- 1 file changed, 742 insertions(+), 964 deletions(-) diff --git a/src/query/mod.rs 
b/src/query/mod.rs index d3d3ce4f4..332170343 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -16,1003 +16,781 @@ * */ - pub mod catalog; - mod filter_optimizer; - pub mod functions; - mod listing_table_builder; - pub mod object_storage; - pub mod stream_schema_provider; - - use arrow_schema::SortOptions; - // use catalog::DynamicObjectStoreCatalog; - use chrono::NaiveDateTime; - use chrono::{DateTime, Duration, Utc}; - use datafusion::arrow::record_batch::RecordBatch; - - use datafusion::common::tree_node::{Transformed, TreeNode, TreeNodeRecursion, TreeNodeVisitor}; - use datafusion::common::DFSchema; - use datafusion::config::Extensions; -// use datafusion::common::{exec_datafusion_err, plan_err}; - // use datafusion::config::ConfigFileType; - use datafusion::error::{DataFusionError, Result}; - use datafusion::execution::disk_manager::DiskManagerConfig; - use datafusion::execution::runtime_env::RuntimeEnvBuilder; - // use datafusion::execution::runtime_env::RuntimeEnvBuilder; - use datafusion::execution::{SessionState, SessionStateBuilder}; - use datafusion::logical_expr::expr::Alias; - use datafusion::logical_expr::{ - Aggregate, Explain, Filter, LogicalPlan, PlanType, Projection, ToStringifiedPlan, - }; - use datafusion::physical_expr::{create_physical_expr, LexOrdering, PhysicalSortExpr}; - use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec; - use datafusion::physical_plan::filter::FilterExec; - use datafusion::physical_plan::repartition::RepartitionExec; - use datafusion::physical_plan::sorts::sort::SortExec; - use datafusion::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; - use datafusion::physical_plan::{collect, ExecutionPlan, Partitioning}; - // use datafusion::physical_plan::execution_plan::EmissionType; - // use datafusion::physical_plan::{collect, execute_stream, ExecutionPlanProperties}; - use datafusion::prelude::*; - // use datafusion::sql::parser::DFParser; - use itertools::Itertools; - use once_cell::sync::Lazy; - use relative_path::RelativePathBuf; - use serde::{Deserialize, Serialize}; - use serde_json::{json, Value}; - use std::ops::Bound; - // use std::path::{Path, PathBuf}; - use std::sync::Arc; - use std::time::Instant; - use stream_schema_provider::collect_manifest_files; - use sysinfo::System; - - use std::fs; - - use self::error::ExecuteError; - use self::stream_schema_provider::GlobalSchemaProvider; - pub use self::stream_schema_provider::PartialTimeFilter; - use crate::catalog::column::{Int64Type, TypedStatistics}; - use crate::catalog::manifest::Manifest; - use crate::catalog::snapshot::Snapshot; - use crate::catalog::Snapshot as CatalogSnapshot; - use crate::event; - use crate::handlers::http::query::QueryError; - use crate::metadata::STREAM_INFO; - use crate::option::{Mode, CONFIG}; - use crate::storage::{ObjectStorageProvider, ObjectStoreFormat, STREAM_ROOT_DIRECTORY}; - use crate::utils::time::TimeRange; - pub static QUERY_SESSION: Lazy = - Lazy::new(|| Query::create_session_context(CONFIG.storage())); - - // A query request by client - #[derive(Debug)] - pub struct Query { - pub raw_logical_plan: LogicalPlan, - pub time_range: TimeRange, - pub filter_tag: Option>, - } - - impl Query { - // create session context for this query - pub fn create_session_context(storage: Arc) -> SessionContext { - let runtime_config = storage - .get_datafusion_runtime() - .with_disk_manager(DiskManagerConfig::NewOs); - - let (pool_size, fraction) = match CONFIG.options.query_memory_pool_size { - Some(size) => (size, 1.), - None 
=> { - let mut system = System::new(); - system.refresh_memory(); - let available_mem = system.available_memory(); - (available_mem as usize, 0.85) - } - }; - - let runtime_config = runtime_config.with_memory_limit(pool_size, fraction); - let runtime = Arc::new(runtime_config.build().unwrap()); - - let mut config = SessionConfig::default() - .with_parquet_pruning(true) - //.with_prefer_existing_sort(true) - .with_round_robin_repartition(true); - - // For more details refer https://datafusion.apache.org/user-guide/configs.html - - // Reduce the number of rows read (if possible) - //config.options_mut().execution.parquet.enable_page_index = true; - - // Pushdown filters allows DF to push the filters as far down in the plan as possible - // and thus, reducing the number of rows decoded - config.options_mut().execution.parquet.pushdown_filters = true; - - // Reorder filters allows DF to decide the order of filters minimizing the cost of filter evaluation - // config.options_mut().execution.parquet.reorder_filters = true; - - // Enable StringViewArray - // https://www.influxdata.com/blog/faster-queries-with-stringview-part-one-influxdb/ - // config - // .options_mut() - // .execution - // .parquet - // .schema_force_view_types = true; - config.options_mut().execution.parquet.binary_as_string = true; - - let state = SessionStateBuilder::new() - .with_default_features() - .with_config(config) - .with_runtime_env(runtime) - .build(); - - let schema_provider = Arc::new(GlobalSchemaProvider { - storage: storage.get_object_store(), - }); - state - .catalog_list() - .catalog(&state.config_options().catalog.default_catalog) - .expect("default catalog is provided by datafusion") - .register_schema( - &state.config_options().catalog.default_schema, - schema_provider, - ) - .unwrap(); - - SessionContext::new_with_state(state) - } - - pub async fn execute( - &self, - stream_name: String, - ) -> Result<(Vec, Vec), ExecuteError> { - let time_partition = STREAM_INFO.get_time_partition(&stream_name)?; - - let df = QUERY_SESSION - .execute_logical_plan(self.final_logical_plan(&time_partition)) - .await?; - - let fields = df - .schema() - .fields() - .iter() - .map(|f| f.name()) - .cloned() - .collect_vec(); - - if fields.is_empty() { - return Ok((vec![], fields)); - } - - let results = df.collect().await?; - Ok((results, fields)) - } - - pub async fn get_dataframe(&self, stream_name: String) -> Result { - let time_partition = STREAM_INFO.get_time_partition(&stream_name)?; - - let df = QUERY_SESSION - .execute_logical_plan(self.final_logical_plan(&time_partition)) - .await?; - - Ok(df) - } - - /// return logical plan with all time filters applied through - fn final_logical_plan(&self, time_partition: &Option) -> LogicalPlan { - // see https://github.com/apache/arrow-datafusion/pull/8400 - // this can be eliminated in later version of datafusion but with slight caveat - // transform cannot modify stringified plans by itself - // we by knowing this plan is not in the optimization procees chose to overwrite the stringified plan - - match self.raw_logical_plan.clone() { - LogicalPlan::Explain(plan) => { - let transformed = transform( - plan.plan.as_ref().clone(), - self.time_range.start.naive_utc(), - self.time_range.end.naive_utc(), - time_partition, - ); - LogicalPlan::Explain(Explain { - verbose: plan.verbose, - stringified_plans: vec![transformed - .data - .to_stringified(PlanType::InitialLogicalPlan)], - plan: Arc::new(transformed.data), - schema: plan.schema, - logical_optimization_succeeded: 
plan.logical_optimization_succeeded, - }) - } - x => { - transform( - x, - self.time_range.start.naive_utc(), - self.time_range.end.naive_utc(), - time_partition, - ) - .data - } - } - } - - pub fn first_table_name(&self) -> Option { - let mut visitor = TableScanVisitor::default(); - let _ = self.raw_logical_plan.visit(&mut visitor); - visitor.into_inner().pop() - } - - /// Evaluates to Some("count(*)") | Some("column_name") if the logical plan is a Projection: SELECT COUNT(*) | SELECT COUNT(*) as column_name - pub fn is_logical_plan_count_without_filters(&self) -> Option<&String> { - // Check if the raw logical plan is a Projection: SELECT - let LogicalPlan::Projection(Projection { input, expr, .. }) = &self.raw_logical_plan else { - return None; - }; - // Check if the input of the Projection is an Aggregate: COUNT(*) - let LogicalPlan::Aggregate(Aggregate { input, .. }) = &**input else { - return None; - }; - - // Ensure the input of the Aggregate is a TableScan and there is exactly one expression: SELECT COUNT(*) - if !matches!(&**input, LogicalPlan::TableScan { .. }) || expr.len() != 1 { - return None; - } - - // Check if the expression is a column or an alias for COUNT(*) - match &expr[0] { - // Direct column check - Expr::Column(Column { name, .. }) if name.to_lowercase() == "count(*)" => Some(name), - // Alias for COUNT(*) - Expr::Alias(Alias { - expr: inner_expr, - name: alias_name, - .. - }) => { - if let Expr::Column(Column { name, .. }) = &**inner_expr { - if name.to_lowercase() == "count(*)" { - return Some(alias_name); - } - } - None - } - // Unsupported expression type - _ => None, - } - } - } - - /// Record of counts for a given time bin. - #[derive(Debug, Serialize, Clone)] - pub struct CountsRecord { - /// Start time of the bin - pub start_time: String, - /// End time of the bin - pub end_time: String, - /// Number of logs in the bin - pub count: u64, - } - - struct TimeBounds { - start: DateTime, - end: DateTime, - } - - /// Request for counts, received from API/SQL query. - #[derive(Debug, Deserialize, Clone)] - #[serde(rename_all = "camelCase")] - pub struct CountsRequest { - /// Name of the stream to get counts for - pub stream: String, - /// Included start time for counts query - pub start_time: String, - /// Excluded end time for counts query - pub end_time: String, - /// Number of bins to divide the time range into - pub num_bins: u64, - } - - impl CountsRequest { - /// This function is supposed to read maninfest files for the given stream, - /// get the sum of `num_rows` between the `startTime` and `endTime`, - /// divide that by number of bins and return in a manner acceptable for the console - pub async fn get_bin_density(&self) -> Result, QueryError> { - let time_partition = STREAM_INFO - .get_time_partition(&self.stream.clone()) - .map_err(|err| anyhow::Error::msg(err.to_string()))? 
- .unwrap_or_else(|| event::DEFAULT_TIMESTAMP_KEY.to_owned()); - - // get time range - let time_range = TimeRange::parse_human_time(&self.start_time, &self.end_time)?; - let all_manifest_files = get_manifest_list(&self.stream, &time_range).await?; - // get bounds - let counts = self.get_bounds(&time_range); - - // we have start and end times for each bin - // we also have all the manifest files for the given time range - // now we iterate over start and end times for each bin - // then we iterate over the manifest files which are within that time range - // we sum up the num_rows - let mut counts_records = Vec::new(); - - for bin in counts { - // extract start and end time to compare - // Sum up the number of rows that fall within the bin - let count: u64 = all_manifest_files - .iter() - .flat_map(|m| &m.files) - .filter_map(|f| { - if f.columns.iter().any(|c| { - c.name == time_partition - && c.stats.as_ref().is_some_and(|stats| match stats { - TypedStatistics::Int(Int64Type { min, .. }) => { - let min = DateTime::from_timestamp_millis(*min).unwrap(); - bin.start <= min && bin.end >= min // Determines if a column matches the bin's time range. - } - _ => false, - }) - }) { - Some(f.num_rows) - } else { - None - } - }) - .sum(); - - counts_records.push(CountsRecord { - start_time: bin.start.to_rfc3339(), - end_time: bin.end.to_rfc3339(), - count, - }); - } - Ok(counts_records) - } - - /// Calculate the end time for each bin based on the number of bins - fn get_bounds(&self, time_range: &TimeRange) -> Vec { - let total_minutes = time_range - .end - .signed_duration_since(time_range.start) - .num_minutes() as u64; - - // divide minutes by num bins to get minutes per bin - let quotient = total_minutes / self.num_bins; - let remainder = total_minutes % self.num_bins; - let have_remainder = remainder > 0; - - // now create multiple bounds [startTime, endTime) - // Should we exclude the last one??? - let mut bounds = vec![]; - - let mut start = time_range.start; - - let loop_end = if have_remainder { - self.num_bins - } else { - self.num_bins - 1 - }; - - // Create bins for all but the last date - for _ in 0..loop_end { - let end = start + Duration::minutes(quotient as i64); - bounds.push(TimeBounds { start, end }); - start = end; - } - - // Add the last bin, accounting for any remainder, should we include it? 
- if have_remainder { - bounds.push(TimeBounds { - start, - end: start + Duration::minutes(remainder as i64), - }); - } else { - bounds.push(TimeBounds { - start, - end: start + Duration::minutes(quotient as i64), - }); - } - - bounds - } - } - - /// Response for the counts API - #[derive(Debug, Serialize, Clone)] - pub struct CountsResponse { - /// Fields in the log stream - pub fields: Vec, - /// Records in the response - pub records: Vec, - } - - #[derive(Debug, Default)] - pub struct TableScanVisitor { - tables: Vec, - } - - impl TableScanVisitor { - pub fn into_inner(self) -> Vec { - self.tables - } - } - - impl TreeNodeVisitor<'_> for TableScanVisitor { - type Node = LogicalPlan; - - fn f_down(&mut self, node: &Self::Node) -> Result { - match node { - LogicalPlan::TableScan(table) => { - self.tables.push(table.table_name.table().to_string()); - Ok(TreeNodeRecursion::Jump) - } - _ => Ok(TreeNodeRecursion::Continue), - } - } - } - - pub async fn get_manifest_list( - stream_name: &str, - time_range: &TimeRange, - ) -> Result, QueryError> { - let glob_storage = CONFIG.storage().get_object_store(); - - let object_store = QUERY_SESSION - .state() - .runtime_env() - .object_store_registry - .get_store(&glob_storage.store_url()) - .unwrap(); - - // get object store - let object_store_format = glob_storage - .get_object_store_format(stream_name) - .await - .map_err(|err| DataFusionError::Plan(err.to_string()))?; - - // all the manifests will go here - let mut merged_snapshot: Snapshot = Snapshot::default(); - - // get a list of manifests - if CONFIG.options.mode == Mode::Query { - let path = RelativePathBuf::from_iter([stream_name, STREAM_ROOT_DIRECTORY]); - let obs = glob_storage - .get_objects( - Some(&path), - Box::new(|file_name| file_name.ends_with("stream.json")), - ) - .await; - if let Ok(obs) = obs { - for ob in obs { - if let Ok(object_store_format) = serde_json::from_slice::(&ob) { - let snapshot = object_store_format.snapshot; - for manifest in snapshot.manifest_list { - merged_snapshot.manifest_list.push(manifest); - } - } - } - } - } else { - merged_snapshot = object_store_format.snapshot; - } - - // Download all the manifest files - let time_filter = [ - PartialTimeFilter::Low(Bound::Included(time_range.start.naive_utc())), - PartialTimeFilter::High(Bound::Included(time_range.end.naive_utc())), - ]; - - let all_manifest_files = collect_manifest_files( - object_store, - merged_snapshot - .manifests(&time_filter) - .into_iter() - .sorted_by_key(|file| file.time_lower_bound) - .map(|item| item.manifest_path) - .collect(), - ) - .await - .map_err(|err| anyhow::Error::msg(err.to_string()))?; - - Ok(all_manifest_files) - } - - fn transform( - plan: LogicalPlan, - start_time: NaiveDateTime, - end_time: NaiveDateTime, - time_partition: &Option, - ) -> Transformed { - plan.transform(&|plan| match plan { - LogicalPlan::TableScan(table) => { - let new_filters = vec![]; - if !table_contains_any_time_filters(&table, time_partition) { - let mut _start_time_filter: Expr; - let mut _end_time_filter: Expr; - match time_partition { - Some(time_partition) => { - _start_time_filter = - PartialTimeFilter::Low(std::ops::Bound::Included(start_time)) - .binary_expr(Expr::Column(Column::new( - Some(table.table_name.to_owned()), - time_partition.clone(), - ))); - _end_time_filter = - PartialTimeFilter::High(std::ops::Bound::Excluded(end_time)) - .binary_expr(Expr::Column(Column::new( - Some(table.table_name.to_owned()), - time_partition, - ))); - } - None => { - _start_time_filter = - 
PartialTimeFilter::Low(std::ops::Bound::Included(start_time)) - .binary_expr(Expr::Column(Column::new( - Some(table.table_name.to_owned()), - event::DEFAULT_TIMESTAMP_KEY, - ))); - _end_time_filter = - PartialTimeFilter::High(std::ops::Bound::Excluded(end_time)) - .binary_expr(Expr::Column(Column::new( - Some(table.table_name.to_owned()), - event::DEFAULT_TIMESTAMP_KEY, - ))); - } - } - - //new_filters.push(_start_time_filter); - //new_filters.push(_end_time_filter); - } - let new_filter = new_filters.into_iter().reduce(and); - if let Some(new_filter) = new_filter { - let filter = - Filter::try_new(new_filter, Arc::new(LogicalPlan::TableScan(table))).unwrap(); - Ok(Transformed::yes(LogicalPlan::Filter(filter))) - } else { - Ok(Transformed::no(LogicalPlan::TableScan(table))) - } - } - x => Ok(Transformed::no(x)), - }) - .expect("transform only transforms the tablescan") - } - - fn table_contains_any_time_filters( - table: &datafusion::logical_expr::TableScan, - time_partition: &Option, - ) -> bool { - table - .filters - .iter() - .filter_map(|x| { - if let Expr::BinaryExpr(binexpr) = x { - Some(binexpr) - } else { - None - } - }) - .any(|expr| { - matches!(&*expr.left, Expr::Column(Column { name, .. }) +pub mod catalog; +mod filter_optimizer; +pub mod functions; +mod listing_table_builder; +pub mod object_storage; +pub mod stream_schema_provider; + +use chrono::NaiveDateTime; +use chrono::{DateTime, Duration, Utc}; +use datafusion::arrow::record_batch::RecordBatch; + +use datafusion::common::tree_node::{Transformed, TreeNode, TreeNodeRecursion, TreeNodeVisitor}; +use datafusion::error::{DataFusionError, Result}; +use datafusion::execution::disk_manager::DiskManagerConfig; +use datafusion::execution::runtime_env::RuntimeEnvBuilder; +use datafusion::execution::SessionStateBuilder; +use datafusion::logical_expr::expr::Alias; +use datafusion::logical_expr::{ + Aggregate, Explain, Filter, LogicalPlan, PlanType, Projection, ToStringifiedPlan, +}; +use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec; +use datafusion::physical_plan::repartition::RepartitionExec; +use datafusion::physical_plan::{collect, ExecutionPlan, Partitioning}; +use datafusion::prelude::*; +use itertools::Itertools; +use once_cell::sync::Lazy; +use relative_path::RelativePathBuf; +use serde::{Deserialize, Serialize}; +use serde_json::{json, Value}; +use std::ops::Bound; +use std::sync::Arc; +use std::time::Instant; +use stream_schema_provider::collect_manifest_files; +use sysinfo::System; + +use std::{env, fs}; + +use self::error::ExecuteError; +use self::stream_schema_provider::GlobalSchemaProvider; +pub use self::stream_schema_provider::PartialTimeFilter; +use crate::catalog::column::{Int64Type, TypedStatistics}; +use crate::catalog::manifest::Manifest; +use crate::catalog::snapshot::Snapshot; +use crate::catalog::Snapshot as CatalogSnapshot; +use crate::event; +use crate::handlers::http::query::QueryError; +use crate::metadata::STREAM_INFO; +use crate::option::{Mode, CONFIG}; +use crate::storage::{ObjectStorageProvider, ObjectStoreFormat, STREAM_ROOT_DIRECTORY}; +use crate::utils::time::TimeRange; +pub static QUERY_SESSION: Lazy = + Lazy::new(|| Query::create_session_context(CONFIG.storage())); + +// A query request by client +#[derive(Debug)] +pub struct Query { + pub raw_logical_plan: LogicalPlan, + pub time_range: TimeRange, + pub filter_tag: Option>, +} + +impl Query { + // create session context for this query + pub fn create_session_context(storage: Arc) -> SessionContext { + let runtime_config = 
storage + .get_datafusion_runtime() + .with_disk_manager(DiskManagerConfig::NewOs); + + let (pool_size, fraction) = match CONFIG.options.query_memory_pool_size { + Some(size) => (size, 1.), + None => { + let mut system = System::new(); + system.refresh_memory(); + let available_mem = system.available_memory(); + (available_mem as usize, 0.85) + } + }; + + let runtime_config = runtime_config.with_memory_limit(pool_size, fraction); + let runtime = Arc::new(runtime_config.build().unwrap()); + + let mut config = SessionConfig::default() + .with_parquet_pruning(true) + //.with_prefer_existing_sort(true) + .with_round_robin_repartition(true); + + // For more details refer https://datafusion.apache.org/user-guide/configs.html + + // Reduce the number of rows read (if possible) + //config.options_mut().execution.parquet.enable_page_index = true; + + // Pushdown filters allows DF to push the filters as far down in the plan as possible + // and thus, reducing the number of rows decoded + config.options_mut().execution.parquet.pushdown_filters = true; + + // Reorder filters allows DF to decide the order of filters minimizing the cost of filter evaluation + // config.options_mut().execution.parquet.reorder_filters = true; + + // Enable StringViewArray + // https://www.influxdata.com/blog/faster-queries-with-stringview-part-one-influxdb/ + // config + // .options_mut() + // .execution + // .parquet + // .schema_force_view_types = true; + config.options_mut().execution.parquet.binary_as_string = true; + + let state = SessionStateBuilder::new() + .with_default_features() + .with_config(config) + .with_runtime_env(runtime) + .build(); + + let schema_provider = Arc::new(GlobalSchemaProvider { + storage: storage.get_object_store(), + }); + state + .catalog_list() + .catalog(&state.config_options().catalog.default_catalog) + .expect("default catalog is provided by datafusion") + .register_schema( + &state.config_options().catalog.default_schema, + schema_provider, + ) + .unwrap(); + + SessionContext::new_with_state(state) + } + + pub async fn execute( + &self, + stream_name: String, + ) -> Result<(Vec, Vec), ExecuteError> { + let time_partition = STREAM_INFO.get_time_partition(&stream_name)?; + + let df = QUERY_SESSION + .execute_logical_plan(self.final_logical_plan(&time_partition)) + .await?; + + let fields = df + .schema() + .fields() + .iter() + .map(|f| f.name()) + .cloned() + .collect_vec(); + + if fields.is_empty() { + return Ok((vec![], fields)); + } + + let results = df.collect().await?; + Ok((results, fields)) + } + + pub async fn get_dataframe(&self, stream_name: String) -> Result { + let time_partition = STREAM_INFO.get_time_partition(&stream_name)?; + + let df = QUERY_SESSION + .execute_logical_plan(self.final_logical_plan(&time_partition)) + .await?; + + Ok(df) + } + + /// return logical plan with all time filters applied through + fn final_logical_plan(&self, time_partition: &Option) -> LogicalPlan { + // see https://github.com/apache/arrow-datafusion/pull/8400 + // this can be eliminated in later version of datafusion but with slight caveat + // transform cannot modify stringified plans by itself + // we by knowing this plan is not in the optimization procees chose to overwrite the stringified plan + + match self.raw_logical_plan.clone() { + LogicalPlan::Explain(plan) => { + let transformed = transform( + plan.plan.as_ref().clone(), + self.time_range.start.naive_utc(), + self.time_range.end.naive_utc(), + time_partition, + ); + LogicalPlan::Explain(Explain { + verbose: plan.verbose, + 
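+ // Overwrite the stringified plan below so that EXPLAIN output reflects the transformed plan rather than the original one.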
stringified_plans: vec![transformed + .data + .to_stringified(PlanType::InitialLogicalPlan)], + plan: Arc::new(transformed.data), + schema: plan.schema, + logical_optimization_succeeded: plan.logical_optimization_succeeded, + }) + } + x => { + transform( + x, + self.time_range.start.naive_utc(), + self.time_range.end.naive_utc(), + time_partition, + ) + .data + } + } + } + + pub fn first_table_name(&self) -> Option { + let mut visitor = TableScanVisitor::default(); + let _ = self.raw_logical_plan.visit(&mut visitor); + visitor.into_inner().pop() + } + + /// Evaluates to Some("count(*)") | Some("column_name") if the logical plan is a Projection: SELECT COUNT(*) | SELECT COUNT(*) as column_name + pub fn is_logical_plan_count_without_filters(&self) -> Option<&String> { + // Check if the raw logical plan is a Projection: SELECT + let LogicalPlan::Projection(Projection { input, expr, .. }) = &self.raw_logical_plan else { + return None; + }; + // Check if the input of the Projection is an Aggregate: COUNT(*) + let LogicalPlan::Aggregate(Aggregate { input, .. }) = &**input else { + return None; + }; + + // Ensure the input of the Aggregate is a TableScan and there is exactly one expression: SELECT COUNT(*) + if !matches!(&**input, LogicalPlan::TableScan { .. }) || expr.len() != 1 { + return None; + } + + // Check if the expression is a column or an alias for COUNT(*) + match &expr[0] { + // Direct column check + Expr::Column(Column { name, .. }) if name.to_lowercase() == "count(*)" => Some(name), + // Alias for COUNT(*) + Expr::Alias(Alias { + expr: inner_expr, + name: alias_name, + .. + }) => { + if let Expr::Column(Column { name, .. }) = &**inner_expr { + if name.to_lowercase() == "count(*)" { + return Some(alias_name); + } + } + None + } + // Unsupported expression type + _ => None, + } + } +} + +/// Record of counts for a given time bin. +#[derive(Debug, Serialize, Clone)] +pub struct CountsRecord { + /// Start time of the bin + pub start_time: String, + /// End time of the bin + pub end_time: String, + /// Number of logs in the bin + pub count: u64, +} + +struct TimeBounds { + start: DateTime, + end: DateTime, +} + +/// Request for counts, received from API/SQL query. +#[derive(Debug, Deserialize, Clone)] +#[serde(rename_all = "camelCase")] +pub struct CountsRequest { + /// Name of the stream to get counts for + pub stream: String, + /// Included start time for counts query + pub start_time: String, + /// Excluded end time for counts query + pub end_time: String, + /// Number of bins to divide the time range into + pub num_bins: u64, +} + +impl CountsRequest { + /// This function is supposed to read maninfest files for the given stream, + /// get the sum of `num_rows` between the `startTime` and `endTime`, + /// divide that by number of bins and return in a manner acceptable for the console + pub async fn get_bin_density(&self) -> Result, QueryError> { + let time_partition = STREAM_INFO + .get_time_partition(&self.stream.clone()) + .map_err(|err| anyhow::Error::msg(err.to_string()))? 
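+ // Fall back to the default timestamp column when the stream has no explicit time partition.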
+ .unwrap_or_else(|| event::DEFAULT_TIMESTAMP_KEY.to_owned()); + + // get time range + let time_range = TimeRange::parse_human_time(&self.start_time, &self.end_time)?; + let all_manifest_files = get_manifest_list(&self.stream, &time_range).await?; + // get bounds + let counts = self.get_bounds(&time_range); + + // we have start and end times for each bin + // we also have all the manifest files for the given time range + // now we iterate over start and end times for each bin + // then we iterate over the manifest files which are within that time range + // we sum up the num_rows + let mut counts_records = Vec::new(); + + for bin in counts { + // extract start and end time to compare + // Sum up the number of rows that fall within the bin + let count: u64 = all_manifest_files + .iter() + .flat_map(|m| &m.files) + .filter_map(|f| { + if f.columns.iter().any(|c| { + c.name == time_partition + && c.stats.as_ref().is_some_and(|stats| match stats { + TypedStatistics::Int(Int64Type { min, .. }) => { + let min = DateTime::from_timestamp_millis(*min).unwrap(); + bin.start <= min && bin.end >= min // Determines if a column matches the bin's time range. + } + _ => false, + }) + }) { + Some(f.num_rows) + } else { + None + } + }) + .sum(); + + counts_records.push(CountsRecord { + start_time: bin.start.to_rfc3339(), + end_time: bin.end.to_rfc3339(), + count, + }); + } + Ok(counts_records) + } + + /// Calculate the end time for each bin based on the number of bins + fn get_bounds(&self, time_range: &TimeRange) -> Vec { + let total_minutes = time_range + .end + .signed_duration_since(time_range.start) + .num_minutes() as u64; + + // divide minutes by num bins to get minutes per bin + let quotient = total_minutes / self.num_bins; + let remainder = total_minutes % self.num_bins; + let have_remainder = remainder > 0; + + // now create multiple bounds [startTime, endTime) + // Should we exclude the last one??? + let mut bounds = vec![]; + + let mut start = time_range.start; + + let loop_end = if have_remainder { + self.num_bins + } else { + self.num_bins - 1 + }; + + // Create bins for all but the last date + for _ in 0..loop_end { + let end = start + Duration::minutes(quotient as i64); + bounds.push(TimeBounds { start, end }); + start = end; + } + + // Add the last bin, accounting for any remainder, should we include it? 
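+ // e.g. a 100 minute range split into 3 bins gives quotient 33 and remainder 1: the loop above emits three 33-minute bins and the branch below appends a final 1-minute bin.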
+ if have_remainder { + bounds.push(TimeBounds { + start, + end: start + Duration::minutes(remainder as i64), + }); + } else { + bounds.push(TimeBounds { + start, + end: start + Duration::minutes(quotient as i64), + }); + } + + bounds + } +} + +/// Response for the counts API +#[derive(Debug, Serialize, Clone)] +pub struct CountsResponse { + /// Fields in the log stream + pub fields: Vec, + /// Records in the response + pub records: Vec, +} + +#[derive(Debug, Default)] +pub struct TableScanVisitor { + tables: Vec, +} + +impl TableScanVisitor { + pub fn into_inner(self) -> Vec { + self.tables + } +} + +impl TreeNodeVisitor<'_> for TableScanVisitor { + type Node = LogicalPlan; + + fn f_down(&mut self, node: &Self::Node) -> Result { + match node { + LogicalPlan::TableScan(table) => { + self.tables.push(table.table_name.table().to_string()); + Ok(TreeNodeRecursion::Jump) + } + _ => Ok(TreeNodeRecursion::Continue), + } + } +} + +pub async fn get_manifest_list( + stream_name: &str, + time_range: &TimeRange, +) -> Result, QueryError> { + let glob_storage = CONFIG.storage().get_object_store(); + + let object_store = QUERY_SESSION + .state() + .runtime_env() + .object_store_registry + .get_store(&glob_storage.store_url()) + .unwrap(); + + // get object store + let object_store_format = glob_storage + .get_object_store_format(stream_name) + .await + .map_err(|err| DataFusionError::Plan(err.to_string()))?; + + // all the manifests will go here + let mut merged_snapshot: Snapshot = Snapshot::default(); + + // get a list of manifests + if CONFIG.options.mode == Mode::Query { + let path = RelativePathBuf::from_iter([stream_name, STREAM_ROOT_DIRECTORY]); + let obs = glob_storage + .get_objects( + Some(&path), + Box::new(|file_name| file_name.ends_with("stream.json")), + ) + .await; + if let Ok(obs) = obs { + for ob in obs { + if let Ok(object_store_format) = serde_json::from_slice::(&ob) { + let snapshot = object_store_format.snapshot; + for manifest in snapshot.manifest_list { + merged_snapshot.manifest_list.push(manifest); + } + } + } + } + } else { + merged_snapshot = object_store_format.snapshot; + } + + // Download all the manifest files + let time_filter = [ + PartialTimeFilter::Low(Bound::Included(time_range.start.naive_utc())), + PartialTimeFilter::High(Bound::Included(time_range.end.naive_utc())), + ]; + + let all_manifest_files = collect_manifest_files( + object_store, + merged_snapshot + .manifests(&time_filter) + .into_iter() + .sorted_by_key(|file| file.time_lower_bound) + .map(|item| item.manifest_path) + .collect(), + ) + .await + .map_err(|err| anyhow::Error::msg(err.to_string()))?; + + Ok(all_manifest_files) +} + +fn transform( + plan: LogicalPlan, + start_time: NaiveDateTime, + end_time: NaiveDateTime, + time_partition: &Option, +) -> Transformed { + plan.transform(&|plan| match plan { + LogicalPlan::TableScan(table) => { + let new_filters = vec![]; + if !table_contains_any_time_filters(&table, time_partition) { + let mut _start_time_filter: Expr; + let mut _end_time_filter: Expr; + match time_partition { + Some(time_partition) => { + _start_time_filter = + PartialTimeFilter::Low(std::ops::Bound::Included(start_time)) + .binary_expr(Expr::Column(Column::new( + Some(table.table_name.to_owned()), + time_partition.clone(), + ))); + _end_time_filter = + PartialTimeFilter::High(std::ops::Bound::Excluded(end_time)) + .binary_expr(Expr::Column(Column::new( + Some(table.table_name.to_owned()), + time_partition, + ))); + } + None => { + _start_time_filter = + 
PartialTimeFilter::Low(std::ops::Bound::Included(start_time)) + .binary_expr(Expr::Column(Column::new( + Some(table.table_name.to_owned()), + event::DEFAULT_TIMESTAMP_KEY, + ))); + _end_time_filter = + PartialTimeFilter::High(std::ops::Bound::Excluded(end_time)) + .binary_expr(Expr::Column(Column::new( + Some(table.table_name.to_owned()), + event::DEFAULT_TIMESTAMP_KEY, + ))); + } + } + + //new_filters.push(_start_time_filter); + //new_filters.push(_end_time_filter); + } + let new_filter = new_filters.into_iter().reduce(and); + if let Some(new_filter) = new_filter { + let filter = + Filter::try_new(new_filter, Arc::new(LogicalPlan::TableScan(table))).unwrap(); + Ok(Transformed::yes(LogicalPlan::Filter(filter))) + } else { + Ok(Transformed::no(LogicalPlan::TableScan(table))) + } + } + x => Ok(Transformed::no(x)), + }) + .expect("transform only transforms the tablescan") +} + +fn table_contains_any_time_filters( + table: &datafusion::logical_expr::TableScan, + time_partition: &Option, +) -> bool { + table + .filters + .iter() + .filter_map(|x| { + if let Expr::BinaryExpr(binexpr) = x { + Some(binexpr) + } else { + None + } + }) + .any(|expr| { + matches!(&*expr.left, Expr::Column(Column { name, .. }) if ((time_partition.is_some() && name == time_partition.as_ref().unwrap()) || (!time_partition.is_some() && name == event::DEFAULT_TIMESTAMP_KEY))) - }) - } - - /// unused for now might need it later - #[allow(unused)] - pub fn flatten_objects_for_count(objects: Vec) -> Vec { - if objects.is_empty() { - return objects; - } - - // check if all the keys start with "COUNT" - let flag = objects.iter().all(|obj| { - obj.as_object() - .unwrap() - .keys() - .all(|key| key.starts_with("COUNT")) - }) && objects.iter().all(|obj| { - obj.as_object() - .unwrap() - .keys() - .all(|key| key == objects[0].as_object().unwrap().keys().next().unwrap()) - }); - - if flag { - let mut accum = 0u64; - let key = objects[0] - .as_object() - .unwrap() - .keys() - .next() - .unwrap() - .clone(); - - for obj in objects { - let count = obj.as_object().unwrap().keys().fold(0, |acc, key| { - let value = obj.as_object().unwrap().get(key).unwrap().as_u64().unwrap(); - acc + value - }); - accum += count; - } - - vec![json!({ - key: accum - })] - } else { - objects - } - } - - // struct AllQueries { - // queries: Vec, - // } - - // impl AllQueries { - // fn try_new(path: &Path) -> Result { - // // ClickBench has all queries in a single file identified by line number - // let all_queries = std::fs::read_to_string(path) - // .map_err(|e| exec_datafusion_err!("Could not open {path:?}: {e}"))?; - // Ok(Self { - // queries: all_queries.lines().map(|s| s.to_string()).collect(), - // }) - // } - - // /// Returns the text of query `query_id` - // fn get_query(&self, query_id: usize) -> Result<&str> { - // self.queries - // .get(query_id) - // .ok_or_else(|| { - // let min_id = self.min_query_id(); - // let max_id = self.max_query_id(); - // exec_datafusion_err!( - // "Invalid query id {query_id}. 
Must be between {min_id} and {max_id}" - // ) - // }) - // .map(|s| s.as_str()) - // } - - // fn min_query_id(&self) -> usize { - // 0 - // } - - // fn max_query_id(&self) -> usize { - // self.queries.len() - 1 - // } - // } - - // pub async fn run() -> Result<()> { - // let rt_config = RuntimeEnvBuilder::new(); - // let runtime_env = rt_config.build().unwrap(); - // println!("Running benchmarks"); - // let queries_path: PathBuf = ["/home", "ubuntu", "queries.sql"] - // .iter() - // .collect(); - // let queries = AllQueries::try_new(queries_path.as_path())?; - // println!("queries loaded"); - // let query_range = queries.min_query_id()..=queries.max_query_id(); - - // // configure parquet options - // let mut config = SessionConfig::new() - // .with_parquet_pruning(true) - // .with_target_partitions(num_cpus::get()) - // .with_coalesce_batches(true) - // .with_collect_statistics(true) - // .with_parquet_page_index_pruning(true); - // config.options_mut().execution.parquet.binary_as_string = true; - // config.options_mut().execution.parquet.pushdown_filters = true; - // config.options_mut().execution.parquet.reorder_filters = true; - // config - // .options_mut() - // .execution - // .use_row_number_estimates_to_optimize_partitioning = true; - // // enable dynamic file query - // let ctx = SessionContext::new_with_config_rt(config, Arc::new(runtime_env)).enable_url_table(); - // ctx.refresh_catalogs().await?; - // // install dynamic catalog provider that can register required object stores - // ctx.register_catalog_list(Arc::new(DynamicObjectStoreCatalog::new( - // ctx.state().catalog_list().clone(), - // ctx.state_weak_ref(), - // ))); - - // let sql = "CREATE EXTERNAL TABLE hits STORED AS PARQUET LOCATION '/home/ubuntu/clickbench/hits.parquet' OPTIONS ('binary_as_string' 'true')"; - // let task_ctx = ctx.task_ctx(); - // let dialect = &task_ctx.session_config().options().sql_parser.dialect; - // let dialect = sqlparser::dialect::dialect_from_str(dialect).unwrap(); - // let plan = ctx.state().create_logical_plan(sql).await?; - // if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &plan { - // let format = Some(ConfigFileType::PARQUET); - // // Clone and modify the default table options based on the provided options - // let mut table_options = ctx.state().default_table_options(); - // if let Some(format) = format { - // table_options.set_config_format(format); - // } - // table_options.alter_with_string_hash_map(&cmd.options)?; - - // ctx.sql(&sql).await?; - // for query_id in query_range { - // let sql = queries.get_query(query_id)?; - // let statements = DFParser::parse_sql_with_dialect(&sql, dialect.as_ref())?; - // for statement in statements { - // let plan = ctx.state().statement_to_plan(statement).await?; - // let df = ctx.execute_logical_plan(plan).await?; - // let physical_plan = df.create_physical_plan().await?; - // if physical_plan.boundedness().is_unbounded() { - // if physical_plan.pipeline_behavior() == EmissionType::Final { - // return plan_err!( - // "The given query can generate a valid result only once \ - // the source finishes, but the source is unbounded" - // ); - // } - // // As the input stream comes, we can generate results. - // // However, memory safety is not guaranteed. 
- // let start = Instant::now(); - // let _ = execute_stream(physical_plan, task_ctx.clone())?; - // let elapsed = start.elapsed().as_secs_f64(); - // println!("Query{query_id} took {elapsed} seconds"); - // } else { - // // Bounded stream; collected results are printed after all input consumed. - // let start = Instant::now(); - // let _ = collect(physical_plan, task_ctx.clone()).await?; - // let elapsed = start.elapsed().as_secs_f64(); - // println!("Q{query_id} took {elapsed} seconds"); - // } - // } - // } - // } else { - // return plan_err!("LogicalPlan is not a CreateExternalTable"); - // } - - // Ok(()) - // } - - pub async fn run_benchmark() { + }) +} + +/// unused for now might need it later +#[allow(unused)] +pub fn flatten_objects_for_count(objects: Vec) -> Vec { + if objects.is_empty() { + return objects; + } + + // check if all the keys start with "COUNT" + let flag = objects.iter().all(|obj| { + obj.as_object() + .unwrap() + .keys() + .all(|key| key.starts_with("COUNT")) + }) && objects.iter().all(|obj| { + obj.as_object() + .unwrap() + .keys() + .all(|key| key == objects[0].as_object().unwrap().keys().next().unwrap()) + }); + + if flag { + let mut accum = 0u64; + let key = objects[0] + .as_object() + .unwrap() + .keys() + .next() + .unwrap() + .clone(); + + for obj in objects { + let count = obj.as_object().unwrap().keys().fold(0, |acc, key| { + let value = obj.as_object().unwrap().get(key).unwrap().as_u64().unwrap(); + acc + value + }); + accum += count; + } + + vec![json!({ + key: accum + })] + } else { + objects + } +} + +pub async fn run_benchmark() { const TRIES: usize = 1; let mut query_num = 1; - let runtime_config = RuntimeEnvBuilder::new() // Number of partitions for parallel processing - .with_disk_manager(DiskManagerConfig::NewOs); - + let runtime_config = RuntimeEnvBuilder::new().with_disk_manager(DiskManagerConfig::NewOs); let runtime = runtime_config.build().unwrap(); - // Create session context - let mut config = SessionConfig::new().with_coalesce_batches(true) - .with_collect_statistics(true) - .with_parquet_bloom_filter_pruning(true) - .with_parquet_page_index_pruning(true) - .with_parquet_pruning(true) - .with_prefer_existing_sort(true) - .with_repartition_file_scans(true) - .with_round_robin_repartition(true) - .with_repartition_sorts(true) - .with_batch_size(1000000) - .with_target_partitions(1); + let mut config = SessionConfig::new() + .with_coalesce_batches(true) + .with_parquet_page_index_pruning(true) + .with_prefer_existing_sort(true) + .with_repartition_file_scans(true) + .with_round_robin_repartition(true) + .with_repartition_sorts(true) + .with_batch_size(1000000) + .with_target_partitions(1); config.options_mut().execution.parquet.binary_as_string = true; - config.options_mut().execution.use_row_number_estimates_to_optimize_partitioning = true; + config + .options_mut() + .execution + .use_row_number_estimates_to_optimize_partitioning = true; config.options_mut().execution.parquet.pushdown_filters = true; config.options_mut().execution.parquet.enable_page_index = true; - config.options_mut().execution.parquet.pruning = true; config.options_mut().execution.parquet.reorder_filters = true; config.options_mut().optimizer.enable_topk_aggregation = true; + config + .options_mut() + .execution + .parquet + .schema_force_view_types = true; let state = SessionStateBuilder::new() .with_default_features() .with_config(config) .with_runtime_env(Arc::new(runtime)) .build(); - let ctx = SessionContext::new_with_state(state); - let sql = "CREATE EXTERNAL TABLE 
hits STORED AS PARQUET LOCATION '/home/ubuntu/clickbench/hits.parquet' OPTIONS ('binary_as_string' 'true')"; + let ctx = SessionContext::new_with_state(state.clone()); + let parquet_file = env::var("PARQUET_LOCATION").unwrap(); //'/home/ubuntu/clickbench/hits.parquet' + let queries_file = env::var("QUERIES_FILE").unwrap(); //'/home/ubuntu/queries.sql' + let sql = format!("CREATE EXTERNAL TABLE hits STORED AS PARQUET LOCATION '{parquet_file}'"); let _ = ctx.sql(&sql).await.unwrap().collect().await.unwrap(); // Read queries from file - let queries = fs::read_to_string("/home/ubuntu/queries.sql").unwrap(); - + let queries = fs::read_to_string(queries_file).unwrap(); let mut total_elapsed = 0.0; for query in queries.lines() { fs::write("/tmp/query.sql", &query).unwrap(); - - for iteration in 1..=TRIES { - + + for iteration in 1..=TRIES { // Create the query plan let df = ctx.sql(&query).await.unwrap(); - //let logical_plan = df.logical_plan().clone(); - let physical_plan = df.create_physical_plan().await.unwrap(); - + let logical_plan = df.logical_plan().clone(); + let physical_plan = state.create_physical_plan(&logical_plan).await.unwrap(); + // Add coalesce - let exec_plan: Arc = Arc::new(CoalesceBatchesExec::new(physical_plan, 1000000)); + let exec_plan: Arc = + Arc::new(CoalesceBatchesExec::new(physical_plan, 1000000)); let task_ctx = ctx.task_ctx(); - let repartitioned = Arc::new(RepartitionExec::try_new( - exec_plan, - Partitioning::RoundRobinBatch(1), - ).unwrap()); - let start = Instant::now(); - let _query_response = collect(repartitioned, task_ctx).await.unwrap(); - + let repartitioned = Arc::new( + RepartitionExec::try_new(exec_plan, Partitioning::RoundRobinBatch(1)).unwrap(), + ); + let start = Instant::now(); + let _ = collect(repartitioned, task_ctx).await.unwrap(); let elapsed = start.elapsed().as_secs_f64(); total_elapsed += elapsed; println!("Query {query_num} iteration {iteration} took {elapsed} seconds"); - } query_num += 1; } println!("Total time: {total_elapsed} seconds"); +} + +pub mod error { + use crate::{metadata::error::stream_info::MetadataError, storage::ObjectStorageError}; + use datafusion::error::DataFusionError; + + #[derive(Debug, thiserror::Error)] + pub enum ExecuteError { + #[error("Query Execution failed due to error in object storage: {0}")] + ObjectStorage(#[from] ObjectStorageError), + #[error("Query Execution failed due to error in datafusion: {0}")] + Datafusion(#[from] DataFusionError), + #[error("Query Execution failed due to error in fetching metadata: {0}")] + Metadata(#[from] MetadataError), + } +} +#[cfg(test)] +mod tests { + use serde_json::json; + + use crate::query::flatten_objects_for_count; + + #[test] + fn test_flat_simple() { + let val = vec![ + json!({ + "COUNT(*)": 1 + }), + json!({ + "COUNT(*)": 2 + }), + json!({ + "COUNT(*)": 3 + }), + ]; + + let out = flatten_objects_for_count(val); + assert_eq!(out, vec![json!({"COUNT(*)": 6})]); + } + + #[test] + fn test_flat_empty() { + let val = vec![]; + let out = flatten_objects_for_count(val.clone()); + assert_eq!(val, out); + } + + #[test] + fn test_flat_same_multi() { + let val = vec![json!({"COUNT(ALPHA)": 1}), json!({"COUNT(ALPHA)": 2})]; + let out = flatten_objects_for_count(val.clone()); + assert_eq!(vec![json!({"COUNT(ALPHA)": 3})], out); + } + + #[test] + fn test_flat_diff_multi() { + let val = vec![json!({"COUNT(ALPHA)": 1}), json!({"COUNT(BETA)": 2})]; + let out = flatten_objects_for_count(val.clone()); + assert_eq!(out, val); + } + + #[test] + fn test_flat_fail() { + let val = vec![ 
+ json!({ + "Num": 1 + }), + json!({ + "Num": 2 + }), + json!({ + "Num": 3 + }), + ]; + + let out = flatten_objects_for_count(val.clone()); + assert_eq!(val, out); + } + + #[test] + fn test_flat_multi_key() { + let val = vec![ + json!({ + "Num": 1, + "COUNT(*)": 1 + }), + json!({ + "Num": 2, + "COUNT(*)": 2 + }), + json!({ + "Num": 3, + "COUNT(*)": 3 + }), + ]; + + let out = flatten_objects_for_count(val.clone()); + assert_eq!(val, out); + } } - fn create_sort_plan( - logical_plan: &LogicalPlan, - exec_plan: Arc, - schema: &DFSchema, - state: &SessionState, - ) -> Result> { - // Extract sort expressions from the logical plan - let sort_exprs = match logical_plan { - LogicalPlan::Sort(sort) => { - // Get sort expressions from Sort node - sort.expr.clone() - } - _ => { - // No sorting specified in query, return original plan - return Ok(exec_plan); - } - }; - - // Convert logical sort expressions to physical sort expressions - let mut physical_sort_exprs = Vec::with_capacity(sort_exprs.len()); - - for sort_expr in sort_exprs { - let physical_expr = create_physical_expr( - &sort_expr.expr, - schema, - state.execution_props(), - )?; - - physical_sort_exprs.push(PhysicalSortExpr { - expr: physical_expr, - options: SortOptions::new(false, false) - }); - } - - - - // Create sort execution plan if we have sort expressions - if !physical_sort_exprs.is_empty() { - let ordering = LexOrdering::new(physical_sort_exprs); - let sort_preserving_merge_plan = Arc::new(SortPreservingMergeExec::new(ordering.clone(), exec_plan).with_fetch(Some(10))); - Ok(Arc::new(SortExec::new(ordering, sort_preserving_merge_plan).with_preserve_partitioning(true).with_fetch(Some(10)))) - } else { - Ok(exec_plan) - } - } - - fn create_filter_plan( - logical_plan: &LogicalPlan, - exec_plan: Arc, - state: &SessionState, - ) -> Result> { - // Extract sort expressions from the logical plan - match logical_plan { - LogicalPlan::Sort(sort) => { - // Get sort expressions from Sort node - create_filter_plan(&sort.input, exec_plan, state) - } - LogicalPlan::Filter(filter) => { - let schema = exec_plan.schema(); - let expr = filter.predicate.clone(); - let df_schema = DFSchema::try_from(Arc::new(schema.as_ref().clone())).unwrap(); - let physical_expr = create_physical_expr(&expr, &df_schema, state.execution_props()).unwrap(); - let filter_exec = FilterExec::try_new(physical_expr, exec_plan).unwrap(); - return Ok(Arc::new(filter_exec)); - } - LogicalPlan::Limit(limit) => { - create_filter_plan(&limit.input, exec_plan, state) - } - LogicalPlan::Projection(proj) => { - create_filter_plan(&proj.input, exec_plan, state) - } - LogicalPlan::Aggregate(agg) => { - create_filter_plan(&agg.input, exec_plan, state) - } - - _ => { - // No sorting specified in query, return original plan - return Ok(exec_plan); - } - } - - - } - - pub mod error { - use crate::{metadata::error::stream_info::MetadataError, storage::ObjectStorageError}; - use datafusion::error::DataFusionError; - - #[derive(Debug, thiserror::Error)] - pub enum ExecuteError { - #[error("Query Execution failed due to error in object storage: {0}")] - ObjectStorage(#[from] ObjectStorageError), - #[error("Query Execution failed due to error in datafusion: {0}")] - Datafusion(#[from] DataFusionError), - #[error("Query Execution failed due to error in fetching metadata: {0}")] - Metadata(#[from] MetadataError), - } - } - - #[cfg(test)] - mod tests { - use serde_json::json; - - use crate::query::flatten_objects_for_count; - - #[test] - fn test_flat_simple() { - let val = vec![ - json!({ - 
"COUNT(*)": 1 - }), - json!({ - "COUNT(*)": 2 - }), - json!({ - "COUNT(*)": 3 - }), - ]; - - let out = flatten_objects_for_count(val); - assert_eq!(out, vec![json!({"COUNT(*)": 6})]); - } - - #[test] - fn test_flat_empty() { - let val = vec![]; - let out = flatten_objects_for_count(val.clone()); - assert_eq!(val, out); - } - - #[test] - fn test_flat_same_multi() { - let val = vec![json!({"COUNT(ALPHA)": 1}), json!({"COUNT(ALPHA)": 2})]; - let out = flatten_objects_for_count(val.clone()); - assert_eq!(vec![json!({"COUNT(ALPHA)": 3})], out); - } - - #[test] - fn test_flat_diff_multi() { - let val = vec![json!({"COUNT(ALPHA)": 1}), json!({"COUNT(BETA)": 2})]; - let out = flatten_objects_for_count(val.clone()); - assert_eq!(out, val); - } - - #[test] - fn test_flat_fail() { - let val = vec![ - json!({ - "Num": 1 - }), - json!({ - "Num": 2 - }), - json!({ - "Num": 3 - }), - ]; - - let out = flatten_objects_for_count(val.clone()); - assert_eq!(val, out); - } - - #[test] - fn test_flat_multi_key() { - let val = vec![ - json!({ - "Num": 1, - "COUNT(*)": 1 - }), - json!({ - "Num": 2, - "COUNT(*)": 2 - }), - json!({ - "Num": 3, - "COUNT(*)": 3 - }), - ]; - - let out = flatten_objects_for_count(val.clone()); - assert_eq!(val, out); - } - } - \ No newline at end of file From b59b47092bab5068b133252acddf128602823f1a Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Tue, 25 Feb 2025 21:51:23 -0500 Subject: [PATCH 24/32] datafusion-cli style --- Cargo.lock | 83 ++++++++ Cargo.toml | 1 + src/query/catalog.rs | 10 +- src/query/cli_context.rs | 92 +++++++++ src/query/exec.rs | 429 +++++++++++++++++++++++++++++++++++++++ src/query/functions.rs | 130 ------------ src/query/helper.rs | 225 ++++++++++++++++++++ src/query/highlighter.rs | 127 ++++++++++++ src/query/mod.rs | 105 ++++------ 9 files changed, 999 insertions(+), 203 deletions(-) create mode 100644 src/query/cli_context.rs create mode 100644 src/query/exec.rs create mode 100644 src/query/helper.rs create mode 100644 src/query/highlighter.rs diff --git a/Cargo.lock b/Cargo.lock index 73dbd3b24..8501453c4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1458,6 +1458,15 @@ version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" +[[package]] +name = "clipboard-win" +version = "5.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15efe7a882b08f34e38556b14f2fb3daa98769d06c7f0c1b076dfd0d983bc892" +dependencies = [ + "error-code", +] + [[package]] name = "clokwerk" version = "0.4.0" @@ -2244,6 +2253,12 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "endian-type" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d" + [[package]] name = "equivalent" version = "1.0.1" @@ -2260,6 +2275,12 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "error-code" +version = "3.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5d9305ccc6942a704f4335694ecd3de2ea531b114ac2d51f5f843750787a92f" + [[package]] name = "fastrand" version = "1.9.0" @@ -2275,6 +2296,17 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +[[package]] +name = "fd-lock" +version = "4.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7e5768da2206272c81ef0b5e951a41862938a6070da63bcea197899942d3b947" +dependencies = [ + "cfg-if", + "rustix", + "windows-sys 0.52.0", +] + [[package]] name = "fixedbitset" version = "0.5.7" @@ -2586,6 +2618,15 @@ dependencies = [ "digest", ] +[[package]] +name = "home" +version = "0.5.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589533453244b0995c858700322199b2becb13b627df2851f64a2775d024abcf" +dependencies = [ + "windows-sys 0.59.0", +] + [[package]] name = "hostname" version = "0.4.0" @@ -3343,6 +3384,15 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "nibble_vec" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a5d83df9f36fe23f0c3648c6bbb8b0298bb5f1939c8f2704431371f4b84d43" +dependencies = [ + "smallvec", +] + [[package]] name = "nix" version = "0.29.0" @@ -3791,6 +3841,7 @@ dependencies = [ "rstest", "rustls 0.22.4", "rustls-pemfile 2.2.0", + "rustyline", "semver", "serde", "serde_json", @@ -4191,6 +4242,16 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "radix_trie" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c069c179fcdc6a2fe24d8d18305cf085fdbd4f922c041943e203685d6a1c58fd" +dependencies = [ + "endian-type", + "nibble_vec", +] + [[package]] name = "rand" version = "0.8.5" @@ -4697,6 +4758,28 @@ version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f7c45b9784283f1b2e7fb61b42047c2fd678ef0960d4f6f1eba131594cc369d4" +[[package]] +name = "rustyline" +version = "15.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ee1e066dc922e513bda599c6ccb5f3bb2b0ea5870a579448f2622993f0a9a2f" +dependencies = [ + "bitflags 2.8.0", + "cfg-if", + "clipboard-win", + "fd-lock", + "home", + "libc", + "log", + "memchr", + "nix", + "radix_trie", + "unicode-segmentation", + "unicode-width", + "utf8parse", + "windows-sys 0.59.0", +] + [[package]] name = "ryu" version = "1.0.19" diff --git a/Cargo.toml b/Cargo.toml index 2130770f1..b08dc6994 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -139,6 +139,7 @@ aws-credential-types = "1.2.1" sqlparser = "0.54.0" nix = {version = "0.29.0", features = ["fs", "mman"]} libc = "0.2.169" +rustyline = "15.0.0" [build-dependencies] cargo_toml = "0.20.1" diff --git a/src/query/catalog.rs b/src/query/catalog.rs index 9d713a188..494ef95ef 100644 --- a/src/query/catalog.rs +++ b/src/query/catalog.rs @@ -23,11 +23,11 @@ pub struct DynamicObjectStoreCatalog { state: Weak>, } -// impl DynamicObjectStoreCatalog { -// pub fn new(inner: Arc, state: Weak>) -> Self { -// Self { inner, state } -// } -// } +impl DynamicObjectStoreCatalog { + pub fn new(inner: Arc, state: Weak>) -> Self { + Self { inner, state } + } +} impl CatalogProviderList for DynamicObjectStoreCatalog { fn as_any(&self) -> &dyn Any { diff --git a/src/query/cli_context.rs b/src/query/cli_context.rs new file mode 100644 index 000000000..152f084f3 --- /dev/null +++ b/src/query/cli_context.rs @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use datafusion::{ + dataframe::DataFrame, + error::DataFusionError, + execution::{context::SessionState, TaskContext}, + logical_expr::LogicalPlan, + prelude::SessionContext, +}; +use object_store::ObjectStore; + +use super::object_storage::{AwsOptions, GcpOptions}; + +#[async_trait::async_trait] +/// The CLI session context trait provides a way to have a session context that can be used with datafusion's CLI code. +pub trait CliSessionContext { + /// Get an atomic reference counted task context. + fn task_ctx(&self) -> Arc; + + /// Get the session state. + fn session_state(&self) -> SessionState; + + /// Register an object store with the session context. + fn register_object_store( + &self, + url: &url::Url, + object_store: Arc, + ) -> Option>; + + /// Register table options extension from scheme. + fn register_table_options_extension_from_scheme(&self, scheme: &str); + + /// Execute a logical plan and return a DataFrame. + async fn execute_logical_plan(&self, plan: LogicalPlan) -> Result; +} + +#[async_trait::async_trait] +impl CliSessionContext for SessionContext { + fn task_ctx(&self) -> Arc { + self.task_ctx() + } + + fn session_state(&self) -> SessionState { + self.state() + } + + fn register_object_store( + &self, + url: &url::Url, + object_store: Arc, + ) -> Option> { + self.register_object_store(url, object_store) + } + + fn register_table_options_extension_from_scheme(&self, scheme: &str) { + match scheme { + // For Amazon S3 or Alibaba Cloud OSS + "s3" | "oss" | "cos" => { + // Register AWS specific table options in the session context: + self.register_table_options_extension(AwsOptions::default()) + } + // For Google Cloud Storage + "gs" | "gcs" => { + // Register GCP specific table options in the session context: + self.register_table_options_extension(GcpOptions::default()) + } + // For unsupported schemes, do nothing: + _ => {} + } + } + + async fn execute_logical_plan(&self, plan: LogicalPlan) -> Result { + self.execute_logical_plan(plan).await + } +} diff --git a/src/query/exec.rs b/src/query/exec.rs new file mode 100644 index 000000000..5ef8b2414 --- /dev/null +++ b/src/query/exec.rs @@ -0,0 +1,429 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Execution functions + +use std::collections::HashMap; +use std::time::Instant; + +use super::cli_context::CliSessionContext; +use super::object_storage::get_object_store; +use datafusion::common::{plan_datafusion_err, plan_err}; +use datafusion::config::ConfigFileType; +use datafusion::datasource::listing::ListingTableUrl; +use datafusion::error::{DataFusionError, Result}; +use datafusion::logical_expr::{DdlStatement, LogicalPlan}; +use datafusion::physical_plan::execution_plan::EmissionType; +use datafusion::physical_plan::{collect, execute_stream, ExecutionPlanProperties}; +use datafusion::sql::parser::{DFParser, Statement}; +use datafusion::sql::sqlparser::dialect::dialect_from_str; + +pub async fn exec_from_commands( + ctx: &dyn CliSessionContext, + commands: Vec, + base_command: bool, +) -> Result<()> { + if !base_command { + const TRIES: usize = 3; + let mut query_num = 1; + let mut total_elapsed_per_iteration = vec![0.0; TRIES]; + for sql in commands.clone() { + for iteration in 1..=TRIES { + let start = Instant::now(); + exec_and_print(ctx, sql.clone()).await?; + let elapsed = start.elapsed().as_secs_f64(); + total_elapsed_per_iteration[iteration - 1] += elapsed; + println!("Query {query_num} iteration {iteration} took {elapsed} seconds"); + } + query_num += 1; + } + for (iteration, total_elapsed) in total_elapsed_per_iteration.iter().enumerate() { + println!( + "Total time for iteration {}: {} seconds", + iteration + 1, + total_elapsed + ); + } + } + exec_and_print(ctx, commands[0].clone()).await?; + + Ok(()) +} + +pub(super) async fn exec_and_print(ctx: &dyn CliSessionContext, sql: String) -> Result<()> { + let task_ctx = ctx.task_ctx(); + let dialect = &task_ctx.session_config().options().sql_parser.dialect; + let dialect = dialect_from_str(dialect).ok_or_else(|| { + plan_datafusion_err!( + "Unsupported SQL dialect: {dialect}. Available dialects: \ + Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, \ + MsSQL, ClickHouse, BigQuery, Ansi." + ) + })?; + + let statements = DFParser::parse_sql_with_dialect(&sql, dialect.as_ref())?; + for statement in statements { + let plan = create_plan(ctx, statement).await?; + + let df = ctx.execute_logical_plan(plan).await?; + let physical_plan = df.create_physical_plan().await?; + + if physical_plan.boundedness().is_unbounded() { + if physical_plan.pipeline_behavior() == EmissionType::Final { + return plan_err!( + "The given query can generate a valid result only once \ + the source finishes, but the source is unbounded" + ); + } + // As the input stream comes, we can generate results. + // However, memory safety is not guaranteed. + let _ = execute_stream(physical_plan, task_ctx.clone())?; + } else { + // Bounded stream; collected results are printed after all input consumed. + let _ = collect(physical_plan, task_ctx.clone()).await?; + } + } + + Ok(()) +} + +fn config_file_type_from_str(ext: &str) -> Option { + match ext.to_lowercase().as_str() { + "csv" => Some(ConfigFileType::CSV), + "json" => Some(ConfigFileType::JSON), + "parquet" => Some(ConfigFileType::PARQUET), + _ => None, + } +} + +async fn create_plan( + ctx: &dyn CliSessionContext, + statement: Statement, +) -> Result { + let mut plan = ctx.session_state().statement_to_plan(statement).await?; + + // Note that cmd is a mutable reference so that create_external_table function can remove all + // datafusion-cli specific options before passing through to datafusion. Otherwise, datafusion + // will raise Configuration errors. 
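+    // For CREATE EXTERNAL TABLE and COPY TO plans, register the target object
+    // store (and any scheme-specific table options) before execution.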
+ if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &plan { + // To support custom formats, treat error as None + let format = config_file_type_from_str(&cmd.file_type); + register_object_store_and_config_extensions(ctx, &cmd.location, &cmd.options, format) + .await?; + } + + if let LogicalPlan::Copy(copy_to) = &mut plan { + let format = config_file_type_from_str(©_to.file_type.get_ext()); + + register_object_store_and_config_extensions( + ctx, + ©_to.output_url, + ©_to.options, + format, + ) + .await?; + } + Ok(plan) +} + +/// Asynchronously registers an object store and its configuration extensions +/// to the session context. +/// +/// This function dynamically registers a cloud object store based on the given +/// location and options. It first parses the location to determine the scheme +/// and constructs the URL accordingly. Depending on the scheme, it also registers +/// relevant options. The function then alters the default table options with the +/// given custom options. Finally, it retrieves and registers the object store +/// in the session context. +/// +/// # Parameters +/// +/// * `ctx`: A reference to the `SessionContext` for registering the object store. +/// * `location`: A string reference representing the location of the object store. +/// * `options`: A reference to a hash map containing configuration options for +/// the object store. +/// +/// # Returns +/// +/// A `Result<()>` which is an Ok value indicating successful registration, or +/// an error upon failure. +/// +/// # Errors +/// +/// This function can return an error if the location parsing fails, options +/// alteration fails, or if the object store cannot be retrieved and registered +/// successfully. +pub(crate) async fn register_object_store_and_config_extensions( + ctx: &dyn CliSessionContext, + location: &String, + options: &HashMap, + format: Option, +) -> Result<()> { + // Parse the location URL to extract the scheme and other components + let table_path = ListingTableUrl::parse(location)?; + + // Extract the scheme (e.g., "s3", "gcs") from the parsed URL + let scheme = table_path.scheme(); + + // Obtain a reference to the URL + let url = table_path.as_ref(); + + // Register the options based on the scheme extracted from the location + ctx.register_table_options_extension_from_scheme(scheme); + + // Clone and modify the default table options based on the provided options + let mut table_options = ctx.session_state().default_table_options(); + if let Some(format) = format { + table_options.set_config_format(format); + } + table_options.alter_with_string_hash_map(options)?; + + // Retrieve the appropriate object store based on the scheme, URL, and modified table options + let store = get_object_store(&ctx.session_state(), scheme, url, &table_options).await?; + + // Register the retrieved object store in the session context's runtime environment + ctx.register_object_store(url, store); + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + use datafusion::common::plan_err; + + use datafusion::prelude::SessionContext; + use url::Url; + + async fn create_external_table_test(location: &str, sql: &str) -> Result<()> { + let ctx = SessionContext::new(); + let plan = ctx.state().create_logical_plan(sql).await?; + + if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &plan { + let format = config_file_type_from_str(&cmd.file_type); + register_object_store_and_config_extensions(&ctx, &cmd.location, &cmd.options, format) + .await?; + } else { + return plan_err!("LogicalPlan 
is not a CreateExternalTable"); + } + + // Ensure the URL is supported by the object store + ctx.runtime_env() + .object_store(ListingTableUrl::parse(location)?)?; + + Ok(()) + } + + async fn copy_to_table_test(location: &str, sql: &str) -> Result<()> { + let ctx = SessionContext::new(); + // AWS CONFIG register. + + let plan = ctx.state().create_logical_plan(sql).await?; + + if let LogicalPlan::Copy(cmd) = &plan { + let format = config_file_type_from_str(&cmd.file_type.get_ext()); + register_object_store_and_config_extensions( + &ctx, + &cmd.output_url, + &cmd.options, + format, + ) + .await?; + } else { + return plan_err!("LogicalPlan is not a CreateExternalTable"); + } + + // Ensure the URL is supported by the object store + ctx.runtime_env() + .object_store(ListingTableUrl::parse(location)?)?; + + Ok(()) + } + + #[tokio::test] + async fn create_object_store_table_http() -> Result<()> { + // Should be OK + let location = "http://example.com/file.parquet"; + let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET LOCATION '{location}'"); + create_external_table_test(location, &sql).await?; + + Ok(()) + } + #[tokio::test] + async fn copy_to_external_object_store_test() -> Result<()> { + let locations = vec![ + "s3://bucket/path/file.parquet", + "oss://bucket/path/file.parquet", + "cos://bucket/path/file.parquet", + "gcs://bucket/path/file.parquet", + ]; + let ctx = SessionContext::new(); + let task_ctx = ctx.task_ctx(); + let dialect = &task_ctx.session_config().options().sql_parser.dialect; + let dialect = dialect_from_str(dialect).ok_or_else(|| { + plan_datafusion_err!( + "Unsupported SQL dialect: {dialect}. Available dialects: \ + Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, \ + MsSQL, ClickHouse, BigQuery, Ansi." + ) + })?; + for location in locations { + let sql = format!("copy (values (1,2)) to '{}' STORED AS PARQUET;", location); + let statements = DFParser::parse_sql_with_dialect(&sql, dialect.as_ref())?; + for statement in statements { + //Should not fail + let mut plan = create_plan(&ctx, statement).await?; + if let LogicalPlan::Copy(copy_to) = &mut plan { + assert_eq!(copy_to.output_url, location); + assert_eq!(copy_to.file_type.get_ext(), "parquet".to_string()); + ctx.runtime_env() + .object_store_registry + .get_store(&Url::parse(©_to.output_url).unwrap())?; + } else { + return plan_err!("LogicalPlan is not a CopyTo"); + } + } + } + Ok(()) + } + + #[tokio::test] + async fn copy_to_object_store_table_s3() -> Result<()> { + let access_key_id = "fake_access_key_id"; + let secret_access_key = "fake_secret_access_key"; + let location = "s3://bucket/path/file.parquet"; + + // Missing region, use object_store defaults + let sql = format!("COPY (values (1,2)) TO '{location}' STORED AS PARQUET + OPTIONS ('aws.access_key_id' '{access_key_id}', 'aws.secret_access_key' '{secret_access_key}')"); + copy_to_table_test(location, &sql).await?; + + Ok(()) + } + + #[tokio::test] + async fn create_object_store_table_s3() -> Result<()> { + let access_key_id = "fake_access_key_id"; + let secret_access_key = "fake_secret_access_key"; + let region = "fake_us-east-2"; + let session_token = "fake_session_token"; + let location = "s3://bucket/path/file.parquet"; + + // Missing region, use object_store defaults + let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET + OPTIONS('aws.access_key_id' '{access_key_id}', 'aws.secret_access_key' '{secret_access_key}') LOCATION '{location}'"); + create_external_table_test(location, &sql).await?; + + // Should be OK + let sql = 
format!("CREATE EXTERNAL TABLE test STORED AS PARQUET + OPTIONS('aws.access_key_id' '{access_key_id}', 'aws.secret_access_key' '{secret_access_key}', 'aws.region' '{region}', 'aws.session_token' '{session_token}') LOCATION '{location}'"); + create_external_table_test(location, &sql).await?; + + Ok(()) + } + + #[tokio::test] + async fn create_object_store_table_oss() -> Result<()> { + let access_key_id = "fake_access_key_id"; + let secret_access_key = "fake_secret_access_key"; + let endpoint = "fake_endpoint"; + let location = "oss://bucket/path/file.parquet"; + + // Should be OK + let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET + OPTIONS('aws.access_key_id' '{access_key_id}', 'aws.secret_access_key' '{secret_access_key}', 'aws.oss.endpoint' '{endpoint}') LOCATION '{location}'"); + create_external_table_test(location, &sql).await?; + + Ok(()) + } + + #[tokio::test] + async fn create_object_store_table_cos() -> Result<()> { + let access_key_id = "fake_access_key_id"; + let secret_access_key = "fake_secret_access_key"; + let endpoint = "fake_endpoint"; + let location = "cos://bucket/path/file.parquet"; + + // Should be OK + let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET + OPTIONS('aws.access_key_id' '{access_key_id}', 'aws.secret_access_key' '{secret_access_key}', 'aws.cos.endpoint' '{endpoint}') LOCATION '{location}'"); + create_external_table_test(location, &sql).await?; + + Ok(()) + } + + #[tokio::test] + async fn create_object_store_table_gcs() -> Result<()> { + let service_account_path = "fake_service_account_path"; + let service_account_key = + "{\"private_key\": \"fake_private_key.pem\",\"client_email\":\"fake_client_email\", \"private_key_id\":\"id\"}"; + let application_credentials_path = "fake_application_credentials_path"; + let location = "gcs://bucket/path/file.parquet"; + + // for service_account_path + let sql = format!( + "CREATE EXTERNAL TABLE test STORED AS PARQUET + OPTIONS('gcp.service_account_path' '{service_account_path}') LOCATION '{location}'" + ); + let err = create_external_table_test(location, &sql) + .await + .unwrap_err(); + assert!(err.to_string().contains("os error 2")); + + // for service_account_key + let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET OPTIONS('gcp.service_account_key' '{service_account_key}') LOCATION '{location}'"); + let err = create_external_table_test(location, &sql) + .await + .unwrap_err() + .to_string(); + assert!(err.contains("No RSA key found in pem file"), "{err}"); + + // for application_credentials_path + let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET + OPTIONS('gcp.application_credentials_path' '{application_credentials_path}') LOCATION '{location}'"); + let err = create_external_table_test(location, &sql) + .await + .unwrap_err(); + assert!(err.to_string().contains("os error 2")); + + Ok(()) + } + + #[tokio::test] + async fn create_external_table_local_file() -> Result<()> { + let location = "path/to/file.parquet"; + + // Ensure that local files are also registered + let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET LOCATION '{location}'"); + create_external_table_test(location, &sql).await.unwrap(); + + Ok(()) + } + + #[tokio::test] + async fn create_external_table_format_option() -> Result<()> { + let location = "path/to/file.cvs"; + + // Test with format options + let sql = + format!("CREATE EXTERNAL TABLE test STORED AS CSV LOCATION '{location}' OPTIONS('format.has_header' 'true')"); + create_external_table_test(location, &sql).await.unwrap(); + + 
Ok(()) + } +} diff --git a/src/query/functions.rs b/src/query/functions.rs index 55d4841eb..2864624b3 100644 --- a/src/query/functions.rs +++ b/src/query/functions.rs @@ -52,122 +52,6 @@ pub enum Function { DropTable, } -// const ALL_FUNCTIONS: [Function; 7] = [ -// Function::CreateTable, -// Function::CreateTableAs, -// Function::DropTable, -// Function::Explain, -// Function::Insert, -// Function::Select, -// Function::Show, -// ]; - -impl Function { - // pub fn function_details(&self) -> Result<&str> { - // let details = match self { - // Function::Select => { - // r#" - // Command: SELECT - // Description: retrieve rows from a table or view - // Syntax: - // SELECT [ ALL | DISTINCT [ ON ( expression [, ...] ) ] ] - // [ * | expression [ [ AS ] output_name ] [, ...] ] - // [ FROM from_item [, ...] ] - // [ WHERE condition ] - // [ GROUP BY [ ALL | DISTINCT ] grouping_element [, ...] ] - // [ HAVING condition ] - // [ WINDOW window_name AS ( window_definition ) [, ...] ] - // [ { UNION | INTERSECT | EXCEPT } [ ALL | DISTINCT ] select ] - // [ ORDER BY expression [ ASC | DESC | USING operator ] [ NULLS { FIRST | LAST } ] [, ...] ] - // [ LIMIT { count | ALL } ] - // [ OFFSET start [ ROW | ROWS ] ] - - // where from_item can be one of: - - // [ ONLY ] table_name [ * ] [ [ AS ] alias [ ( column_alias [, ...] ) ] ] - // [ TABLESAMPLE sampling_method ( argument [, ...] ) [ REPEATABLE ( seed ) ] ] - // [ LATERAL ] ( select ) [ AS ] alias [ ( column_alias [, ...] ) ] - // with_query_name [ [ AS ] alias [ ( column_alias [, ...] ) ] ] - // [ LATERAL ] function_name ( [ argument [, ...] ] ) - // [ WITH ORDINALITY ] [ [ AS ] alias [ ( column_alias [, ...] ) ] ] - // [ LATERAL ] function_name ( [ argument [, ...] ] ) [ AS ] alias ( column_definition [, ...] ) - // [ LATERAL ] function_name ( [ argument [, ...] ] ) AS ( column_definition [, ...] ) - // [ LATERAL ] ROWS FROM( function_name ( [ argument [, ...] ] ) [ AS ( column_definition [, ...] ) ] [, ...] ) - // [ WITH ORDINALITY ] [ [ AS ] alias [ ( column_alias [, ...] ) ] ] - // from_item [ NATURAL ] join_type from_item [ ON join_condition | USING ( join_column [, ...] ) [ AS join_using_alias ] ] - - // and grouping_element can be one of: - - // ( ) - // expression - // ( expression [, ...] ) - - // and with_query is: - - // with_query_name [ ( column_name [, ...] ) ] AS [ [ NOT ] MATERIALIZED ] ( select | values | insert | update | delete ) - - // TABLE [ ONLY ] table_name [ * ]"# - // } - // Function::Explain => { - // r#" - // Command: EXPLAIN - // Description: show the execution plan of a statement - // Syntax: - // EXPLAIN [ ANALYZE ] statement - // "# - // } - // Function::Show => { - // r#" - // Command: SHOW - // Description: show the value of a run-time parameter - // Syntax: - // SHOW name - // "# - // } - // Function::CreateTable => { - // r#" - // Command: CREATE TABLE - // Description: define a new table - // Syntax: - // CREATE [ EXTERNAL ] TABLE table_name ( [ - // { column_name data_type } - // [, ... ] - // ] ) - // "# - // } - // Function::CreateTableAs => { - // r#" - // Command: CREATE TABLE AS - // Description: define a new table from the results of a query - // Syntax: - // CREATE TABLE table_name - // [ (column_name [, ...] ) ] - // AS query - // [ WITH [ NO ] DATA ] - // "# - // } - // Function::Insert => { - // r#" - // Command: INSERT - // Description: create new rows in a table - // Syntax: - // INSERT INTO table_name [ ( column_name [, ...] ) ] - // { VALUES ( { expression } [, ...] ) [, ...] 
} - // "# - // } - // Function::DropTable => { - // r#" - // Command: DROP TABLE - // Description: remove a table - // Syntax: - // DROP TABLE [ IF EXISTS ] name [, ...] - // "# - // } - // }; - // Ok(details) - // } -} - impl FromStr for Function { type Err = (); @@ -199,20 +83,6 @@ impl fmt::Display for Function { } } -// pub fn display_all_functions() -> Result<()> { -// println!("Available help:"); -// let array = StringArray::from( -// ALL_FUNCTIONS -// .iter() -// .map(|f| format!("{}", f)) -// .collect::>(), -// ); -// let schema = Schema::new(vec![Field::new("Function", DataType::Utf8, false)]); -// let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(array)])?; -// println!("{}", pretty_format_batches(&[batch]).unwrap()); -// Ok(()) -// } - /// PARQUET_META table function #[derive(Debug)] struct ParquetMetadataTable { diff --git a/src/query/helper.rs b/src/query/helper.rs new file mode 100644 index 000000000..e2383197c --- /dev/null +++ b/src/query/helper.rs @@ -0,0 +1,225 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Helper that helps with interactive editing, including multi-line parsing and validation, +//! and auto-completion for file name during creating external table. 
+ +use std::borrow::Cow; + +use super::highlighter::{NoSyntaxHighlighter, SyntaxHighlighter}; + +use datafusion::common::sql_datafusion_err; +use datafusion::error::DataFusionError; +use datafusion::sql::parser::{DFParser, Statement}; +use datafusion::sql::sqlparser::dialect::dialect_from_str; +use datafusion::sql::sqlparser::parser::ParserError; + +use rustyline::completion::{Completer, FilenameCompleter, Pair}; +use rustyline::error::ReadlineError; +use rustyline::highlight::{CmdKind, Highlighter}; +use rustyline::hint::Hinter; +use rustyline::validate::{ValidationContext, ValidationResult, Validator}; +use rustyline::{Context, Helper, Result}; + +pub struct CliHelper { + completer: FilenameCompleter, + dialect: String, + highlighter: Box, +} + +impl CliHelper { + pub fn new(dialect: &str, color: bool) -> Self { + let highlighter: Box = if !color { + Box::new(NoSyntaxHighlighter {}) + } else { + Box::new(SyntaxHighlighter::new(dialect)) + }; + Self { + completer: FilenameCompleter::new(), + dialect: dialect.into(), + highlighter, + } + } + + fn validate_input(&self, input: &str) -> Result { + if let Some(sql) = input.strip_suffix(';') { + let sql = match unescape_input(sql) { + Ok(sql) => sql, + Err(err) => { + return Ok(ValidationResult::Invalid(Some(format!( + " 🤔 Invalid statement: {err}", + )))) + } + }; + + let dialect = match dialect_from_str(&self.dialect) { + Some(dialect) => dialect, + None => { + return Ok(ValidationResult::Invalid(Some(format!( + " 🤔 Invalid dialect: {}", + self.dialect + )))) + } + }; + let lines = split_from_semicolon(sql); + for line in lines { + match DFParser::parse_sql_with_dialect(&line, dialect.as_ref()) { + Ok(statements) if statements.is_empty() => { + return Ok(ValidationResult::Invalid(Some( + " 🤔 You entered an empty statement".to_string(), + ))); + } + Ok(_statements) => {} + Err(err) => { + return Ok(ValidationResult::Invalid(Some(format!( + " 🤔 Invalid statement: {err}", + )))); + } + } + } + Ok(ValidationResult::Valid(None)) + } else if input.starts_with('\\') { + // command + Ok(ValidationResult::Valid(None)) + } else { + Ok(ValidationResult::Incomplete) + } + } +} + +impl Default for CliHelper { + fn default() -> Self { + Self::new("generic", false) + } +} + +impl Highlighter for CliHelper { + fn highlight<'l>(&self, line: &'l str, pos: usize) -> Cow<'l, str> { + self.highlighter.highlight(line, pos) + } + + fn highlight_char(&self, line: &str, pos: usize, kind: CmdKind) -> bool { + self.highlighter.highlight_char(line, pos, kind) + } +} + +impl Hinter for CliHelper { + type Hint = String; +} + +/// returns true if the current position is after the open quote for +/// creating an external table. +fn is_open_quote_for_location(line: &str, pos: usize) -> bool { + let mut sql = line[..pos].to_string(); + sql.push('\''); + if let Ok(stmts) = DFParser::parse_sql(&sql) { + if let Some(Statement::CreateExternalTable(_)) = stmts.back() { + return true; + } + } + false +} + +impl Completer for CliHelper { + type Candidate = Pair; + + fn complete( + &self, + line: &str, + pos: usize, + ctx: &Context<'_>, + ) -> std::result::Result<(usize, Vec), ReadlineError> { + if is_open_quote_for_location(line, pos) { + self.completer.complete(line, pos, ctx) + } else { + Ok((0, Vec::with_capacity(0))) + } + } +} + +impl Validator for CliHelper { + fn validate(&self, ctx: &mut ValidationContext<'_>) -> Result { + let input = ctx.input().trim_end(); + self.validate_input(input) + } +} + +impl Helper for CliHelper {} + +/// Unescape input string from readline. 
+/// +/// The data read from stdio will be escaped, so we need to unescape the input before executing the input +pub fn unescape_input(input: &str) -> datafusion::error::Result { + let mut chars = input.chars(); + + let mut result = String::with_capacity(input.len()); + while let Some(char) = chars.next() { + if char == '\\' { + if let Some(next_char) = chars.next() { + // https://static.rust-lang.org/doc/master/reference.html#literals + result.push(match next_char { + '0' => '\0', + 'n' => '\n', + 'r' => '\r', + 't' => '\t', + '\\' => '\\', + _ => { + return Err(sql_datafusion_err!(ParserError::TokenizerError(format!( + "unsupported escape char: '\\{}'", + next_char + )))) + } + }); + } + } else { + result.push(char); + } + } + + Ok(result) +} + +/// Splits a string which consists of multiple queries. +pub(crate) fn split_from_semicolon(sql: String) -> Vec { + let mut commands = Vec::new(); + let mut current_command = String::new(); + let mut in_single_quote = false; + let mut in_double_quote = false; + + for c in sql.chars() { + if c == '\'' && !in_double_quote { + in_single_quote = !in_single_quote; + } else if c == '"' && !in_single_quote { + in_double_quote = !in_double_quote; + } + + if c == ';' && !in_single_quote && !in_double_quote { + if !current_command.trim().is_empty() { + commands.push(format!("{};", current_command.trim())); + current_command.clear(); + } + } else { + current_command.push(c); + } + } + + if !current_command.trim().is_empty() { + commands.push(format!("{};", current_command.trim())); + } + + commands +} diff --git a/src/query/highlighter.rs b/src/query/highlighter.rs new file mode 100644 index 000000000..f3e13ed5c --- /dev/null +++ b/src/query/highlighter.rs @@ -0,0 +1,127 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! The syntax highlighter. + +use std::{ + borrow::Cow::{self, Borrowed}, + fmt::Display, +}; + +use datafusion::sql::sqlparser::{ + dialect::{dialect_from_str, Dialect, GenericDialect}, + keywords::Keyword, + tokenizer::{Token, Tokenizer}, +}; +use rustyline::highlight::{CmdKind, Highlighter}; + +/// The syntax highlighter. +#[derive(Debug)] +pub struct SyntaxHighlighter { + dialect: Box, +} + +impl SyntaxHighlighter { + pub fn new(dialect: &str) -> Self { + let dialect = dialect_from_str(dialect).unwrap_or(Box::new(GenericDialect {})); + Self { dialect } + } +} + +pub struct NoSyntaxHighlighter {} + +impl Highlighter for NoSyntaxHighlighter {} + +impl Highlighter for SyntaxHighlighter { + fn highlight<'l>(&self, line: &'l str, _: usize) -> Cow<'l, str> { + let mut out_line = String::new(); + + // `with_unescape(false)` since we want to rebuild the original string. 
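+        // Tokenize the line and re-emit each token: keywords in red, single-quoted
+        // strings in green; if tokenizing fails, return the line unchanged.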
+ let mut tokenizer = Tokenizer::new(self.dialect.as_ref(), line).with_unescape(false); + let tokens = tokenizer.tokenize(); + match tokens { + Ok(tokens) => { + for token in tokens.iter() { + match token { + Token::Word(w) if w.keyword != Keyword::NoKeyword => { + out_line.push_str(&Color::red(token)); + } + Token::SingleQuotedString(_) => { + out_line.push_str(&Color::green(token)); + } + other => out_line.push_str(&format!("{other}")), + } + } + out_line.into() + } + Err(_) => Borrowed(line), + } + } + + fn highlight_char(&self, line: &str, _pos: usize, _cmd: CmdKind) -> bool { + !line.is_empty() + } +} + +/// Convenient utility to return strings with [ANSI color](https://gist.github.com/JBlond/2fea43a3049b38287e5e9cefc87b2124). +struct Color {} + +impl Color { + fn green(s: impl Display) -> String { + format!("\x1b[92m{s}\x1b[0m") + } + + fn red(s: impl Display) -> String { + format!("\x1b[91m{s}\x1b[0m") + } +} + +#[cfg(test)] +mod tests { + use super::SyntaxHighlighter; + use rustyline::highlight::Highlighter; + + #[test] + fn highlighter_valid() { + let s = "SElect col_a from tab_1;"; + let highlighter = SyntaxHighlighter::new("generic"); + let out = highlighter.highlight(s, s.len()); + assert_eq!( + "\u{1b}[91mSElect\u{1b}[0m col_a \u{1b}[91mfrom\u{1b}[0m tab_1;", + out + ); + } + + #[test] + fn highlighter_valid_with_new_line() { + let s = "SElect col_a from tab_1\n WHERE col_b = 'なにか';"; + let highlighter = SyntaxHighlighter::new("generic"); + let out = highlighter.highlight(s, s.len()); + assert_eq!( + "\u{1b}[91mSElect\u{1b}[0m col_a \u{1b}[91mfrom\u{1b}[0m tab_1\n \u{1b}[91mWHERE\u{1b}[0m col_b = \u{1b}[92m'なにか'\u{1b}[0m;", + out + ); + } + + #[test] + fn highlighter_invalid() { + let s = "SElect col_a from tab_1 WHERE col_b = ';"; + let highlighter = SyntaxHighlighter::new("generic"); + let out = highlighter.highlight(s, s.len()); + assert_eq!("SElect col_a from tab_1 WHERE col_b = ';", out); + } +} diff --git a/src/query/mod.rs b/src/query/mod.rs index 332170343..46039425f 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -17,12 +17,18 @@ */ pub mod catalog; +pub mod exec; mod filter_optimizer; pub mod functions; +pub mod helper; +pub mod highlighter; mod listing_table_builder; pub mod object_storage; pub mod stream_schema_provider; +pub mod cli_context; + +use catalog::DynamicObjectStoreCatalog; use chrono::NaiveDateTime; use chrono::{DateTime, Duration, Utc}; use datafusion::arrow::record_batch::RecordBatch; @@ -36,10 +42,8 @@ use datafusion::logical_expr::expr::Alias; use datafusion::logical_expr::{ Aggregate, Explain, Filter, LogicalPlan, PlanType, Projection, ToStringifiedPlan, }; -use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec; -use datafusion::physical_plan::repartition::RepartitionExec; -use datafusion::physical_plan::{collect, ExecutionPlan, Partitioning}; use datafusion::prelude::*; +use functions::ParquetMetadataFunc; use itertools::Itertools; use once_cell::sync::Lazy; use relative_path::RelativePathBuf; @@ -47,7 +51,6 @@ use serde::{Deserialize, Serialize}; use serde_json::{json, Value}; use std::ops::Bound; use std::sync::Arc; -use std::time::Instant; use stream_schema_provider::collect_manifest_files; use sysinfo::System; @@ -624,75 +627,41 @@ pub fn flatten_objects_for_count(objects: Vec) -> Vec { } } -pub async fn run_benchmark() { - const TRIES: usize = 1; - let mut query_num = 1; - let runtime_config = RuntimeEnvBuilder::new().with_disk_manager(DiskManagerConfig::NewOs); - - let runtime = runtime_config.build().unwrap(); - - // 
Create session context - let mut config = SessionConfig::new() - .with_coalesce_batches(true) - .with_parquet_page_index_pruning(true) - .with_prefer_existing_sort(true) - .with_repartition_file_scans(true) - .with_round_robin_repartition(true) - .with_repartition_sorts(true) - .with_batch_size(1000000) - .with_target_partitions(1); - config.options_mut().execution.parquet.binary_as_string = true; - config - .options_mut() - .execution - .use_row_number_estimates_to_optimize_partitioning = true; - config.options_mut().execution.parquet.pushdown_filters = true; - config.options_mut().execution.parquet.enable_page_index = true; - config.options_mut().execution.parquet.reorder_filters = true; - config.options_mut().optimizer.enable_topk_aggregation = true; - config - .options_mut() - .execution - .parquet - .schema_force_view_types = true; - let state = SessionStateBuilder::new() - .with_default_features() - .with_config(config) - .with_runtime_env(Arc::new(runtime)) - .build(); - let ctx = SessionContext::new_with_state(state.clone()); +pub async fn run_benchmark() -> Result<(), ExecuteError> { + let mut session_config = SessionConfig::from_env()?.with_information_schema(true); + + session_config = session_config.with_batch_size(8192); + + let rt_builder = RuntimeEnvBuilder::new(); + // set memory pool size + let runtime_env = rt_builder.build_arc()?; + + // enable dynamic file query + let ctx = SessionContext::new_with_config_rt(session_config, runtime_env).enable_url_table(); + // install dynamic catalog provider that can register required object stores + ctx.register_catalog_list(Arc::new(DynamicObjectStoreCatalog::new( + ctx.state().catalog_list().clone(), + ctx.state_weak_ref(), + ))); + // register `parquet_metadata` table function to get metadata from parquet files + ctx.register_udtf("parquet_metadata", Arc::new(ParquetMetadataFunc {})); + let parquet_file = env::var("PARQUET_LOCATION").unwrap(); //'/home/ubuntu/clickbench/hits.parquet' + + let base_command = + format!("CREATE EXTERNAL TABLE hits STORED AS PARQUET LOCATION '{parquet_file}' OPTIONS ('binary_as_string' 'true')", + parquet_file = parquet_file); + + let mut commands = Vec::new(); let queries_file = env::var("QUERIES_FILE").unwrap(); //'/home/ubuntu/queries.sql' - let sql = format!("CREATE EXTERNAL TABLE hits STORED AS PARQUET LOCATION '{parquet_file}'"); - let _ = ctx.sql(&sql).await.unwrap().collect().await.unwrap(); - // Read queries from file + exec::exec_from_commands(&ctx, vec![base_command], true).await?; let queries = fs::read_to_string(queries_file).unwrap(); - let mut total_elapsed = 0.0; for query in queries.lines() { - fs::write("/tmp/query.sql", &query).unwrap(); - - for iteration in 1..=TRIES { - // Create the query plan - let df = ctx.sql(&query).await.unwrap(); - let logical_plan = df.logical_plan().clone(); - let physical_plan = state.create_physical_plan(&logical_plan).await.unwrap(); - - // Add coalesce - let exec_plan: Arc = - Arc::new(CoalesceBatchesExec::new(physical_plan, 1000000)); - let task_ctx = ctx.task_ctx(); - let repartitioned = Arc::new( - RepartitionExec::try_new(exec_plan, Partitioning::RoundRobinBatch(1)).unwrap(), - ); - let start = Instant::now(); - let _ = collect(repartitioned, task_ctx).await.unwrap(); - let elapsed = start.elapsed().as_secs_f64(); - total_elapsed += elapsed; - println!("Query {query_num} iteration {iteration} took {elapsed} seconds"); - } - query_num += 1; + commands.push(query.to_string()); } - println!("Total time: {total_elapsed} seconds"); + 
exec::exec_from_commands(&ctx, commands, false).await?; + + Ok(()) } pub mod error { From f70a0d19b0d067c3290dcc7cb89bbcd0a128b038 Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Wed, 26 Feb 2025 01:54:48 -0500 Subject: [PATCH 25/32] deleted unused --- Cargo.toml | 1 + src/main.rs | 2 +- src/query/cli_context.rs | 10 +- src/query/exec.rs | 13 ++- src/query/functions.rs | 56 ++-------- src/query/helper.rs | 225 --------------------------------------- src/query/highlighter.rs | 127 ---------------------- src/query/mod.rs | 6 +- 8 files changed, 29 insertions(+), 411 deletions(-) delete mode 100644 src/query/helper.rs delete mode 100644 src/query/highlighter.rs diff --git a/Cargo.toml b/Cargo.toml index b08dc6994..1a262c127 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -72,6 +72,7 @@ tokio = { version = "1.28", default-features = false, features = [ "sync", "macros", "fs", + "rt-multi-thread" ] } tokio-stream = { version = "0.1", features = ["fs"] } diff --git a/src/main.rs b/src/main.rs index 069306951..95bb914a5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -32,7 +32,7 @@ use tracing_subscriber::EnvFilter; ))] use parseable::kafka; -#[actix_web::main] +#[tokio::main] async fn main() -> anyhow::Result<()> { tracing_subscriber::fmt() .with_env_filter(EnvFilter::from_default_env()) diff --git a/src/query/cli_context.rs b/src/query/cli_context.rs index 152f084f3..6b6c99b37 100644 --- a/src/query/cli_context.rs +++ b/src/query/cli_context.rs @@ -48,7 +48,10 @@ pub trait CliSessionContext { fn register_table_options_extension_from_scheme(&self, scheme: &str); /// Execute a logical plan and return a DataFrame. - async fn execute_logical_plan(&self, plan: LogicalPlan) -> Result; + async fn execute_logical_plan( + &self, + plan: LogicalPlan, + ) -> Result; } #[async_trait::async_trait] @@ -86,7 +89,10 @@ impl CliSessionContext for SessionContext { } } - async fn execute_logical_plan(&self, plan: LogicalPlan) -> Result { + async fn execute_logical_plan( + &self, + plan: LogicalPlan, + ) -> Result { self.execute_logical_plan(plan).await } } diff --git a/src/query/exec.rs b/src/query/exec.rs index 5ef8b2414..fe8e993e7 100644 --- a/src/query/exec.rs +++ b/src/query/exec.rs @@ -18,10 +18,10 @@ //! 
Execution functions use std::collections::HashMap; -use std::time::Instant; - use super::cli_context::CliSessionContext; use super::object_storage::get_object_store; + +use datafusion::common::instant::Instant; use datafusion::common::{plan_datafusion_err, plan_err}; use datafusion::config::ConfigFileType; use datafusion::datasource::listing::ListingTableUrl; @@ -32,6 +32,8 @@ use datafusion::physical_plan::{collect, execute_stream, ExecutionPlanProperties use datafusion::sql::parser::{DFParser, Statement}; use datafusion::sql::sqlparser::dialect::dialect_from_str; + +/// run and execute SQL statements and commands, against a context with the given print options pub async fn exec_from_commands( ctx: &dyn CliSessionContext, commands: Vec, @@ -64,7 +66,10 @@ pub async fn exec_from_commands( Ok(()) } -pub(super) async fn exec_and_print(ctx: &dyn CliSessionContext, sql: String) -> Result<()> { +pub(super) async fn exec_and_print( + ctx: &dyn CliSessionContext, + sql: String, +) -> Result<()> { let task_ctx = ctx.task_ctx(); let dialect = &task_ctx.session_config().options().sql_parser.dialect; let dialect = dialect_from_str(dialect).ok_or_else(|| { @@ -77,6 +82,7 @@ pub(super) async fn exec_and_print(ctx: &dyn CliSessionContext, sql: String) -> let statements = DFParser::parse_sql_with_dialect(&sql, dialect.as_ref())?; for statement in statements { + let plan = create_plan(ctx, statement).await?; let df = ctx.execute_logical_plan(plan).await?; @@ -101,6 +107,7 @@ pub(super) async fn exec_and_print(ctx: &dyn CliSessionContext, sql: String) -> Ok(()) } + fn config_file_type_from_str(ext: &str) -> Option { match ext.to_lowercase().as_str() { "csv" => Some(ConfigFileType::CSV), diff --git a/src/query/functions.rs b/src/query/functions.rs index 2864624b3..f74675248 100644 --- a/src/query/functions.rs +++ b/src/query/functions.rs @@ -17,9 +17,7 @@ //! Functions that are query-able and searchable via the `\h` command -use std::fmt; use std::fs::File; -use std::str::FromStr; use std::sync::Arc; use arrow::array::{Int64Array, StringArray}; @@ -41,47 +39,6 @@ use parquet::file::reader::FileReader; use parquet::file::serialized_reader::SerializedFileReader; use parquet::file::statistics::Statistics; -#[derive(Debug)] -pub enum Function { - Select, - Explain, - Show, - CreateTable, - CreateTableAs, - Insert, - DropTable, -} - -impl FromStr for Function { - type Err = (); - - fn from_str(s: &str) -> Result { - Ok(match s.trim().to_uppercase().as_str() { - "SELECT" => Self::Select, - "EXPLAIN" => Self::Explain, - "SHOW" => Self::Show, - "CREATE TABLE" => Self::CreateTable, - "CREATE TABLE AS" => Self::CreateTableAs, - "INSERT" => Self::Insert, - "DROP TABLE" => Self::DropTable, - _ => return Err(()), - }) - } -} - -impl fmt::Display for Function { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match *self { - Function::Select => write!(f, "SELECT"), - Function::Explain => write!(f, "EXPLAIN"), - Function::Show => write!(f, "SHOW"), - Function::CreateTable => write!(f, "CREATE TABLE"), - Function::CreateTableAs => write!(f, "CREATE TABLE AS"), - Function::Insert => write!(f, "INSERT"), - Function::DropTable => write!(f, "DROP TABLE"), - } - } -} /// PARQUET_META table function #[derive(Debug)] @@ -194,13 +151,14 @@ impl TableFunctionImpl for ParquetMetadataFunc { Some(Expr::Literal(ScalarValue::Utf8(Some(s)))) => s, // single quote: parquet_metadata('x.parquet') Some(Expr::Column(Column { name, .. 
})) => name, // double quote: parquet_metadata("x.parquet") _ => { - return plan_err!("parquet_metadata requires string argument as its input"); + return plan_err!( + "parquet_metadata requires string argument as its input" + ); } }; let file = File::open(filename.clone())?; - let reader = - SerializedFileReader::new(file).map_err(datafusion::error::DataFusionError::from)?; + let reader = SerializedFileReader::new(file)?; let metadata = reader.metadata(); let schema = Arc::new(Schema::new(vec![ @@ -268,11 +226,13 @@ impl TableFunctionImpl for ParquetMetadataFunc { let converted_type = column.column_descr().converted_type(); if let Some(s) = column.statistics() { - let (min_val, max_val) = convert_parquet_statistics(s, converted_type); + let (min_val, max_val) = + convert_parquet_statistics(s, converted_type); stats_min_arr.push(min_val.clone()); stats_max_arr.push(max_val.clone()); stats_null_count_arr.push(s.null_count_opt().map(|c| c as i64)); - stats_distinct_count_arr.push(s.distinct_count_opt().map(|c| c as i64)); + stats_distinct_count_arr + .push(s.distinct_count_opt().map(|c| c as i64)); stats_min_value_arr.push(min_val); stats_max_value_arr.push(max_val); } else { diff --git a/src/query/helper.rs b/src/query/helper.rs deleted file mode 100644 index e2383197c..000000000 --- a/src/query/helper.rs +++ /dev/null @@ -1,225 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Helper that helps with interactive editing, including multi-line parsing and validation, -//! and auto-completion for file name during creating external table. 
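// --- Illustrative usage sketch (not part of this patch) --------------------
// The `parquet_metadata` table function kept above is easiest to read from
// the caller's side. Assuming it has been registered with
// `ctx.register_udtf("parquet_metadata", Arc::new(ParquetMetadataFunc {}))`
// as in the earlier hunk, and that "hits.parquet" is only a placeholder path,
// it can be queried straight from SQL:

use datafusion::error::Result;
use datafusion::prelude::SessionContext;

async fn show_parquet_metadata(ctx: &SessionContext) -> Result<()> {
    // The path is passed as a single-quoted SQL string literal, matching the
    // `parquet_metadata('x.parquet')` form noted in the match arm above.
    let df = ctx
        .sql("SELECT * FROM parquet_metadata('hits.parquet')")
        .await?;
    df.show().await?;
    Ok(())
}
// ----------------------------------------------------------------------------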
- -use std::borrow::Cow; - -use super::highlighter::{NoSyntaxHighlighter, SyntaxHighlighter}; - -use datafusion::common::sql_datafusion_err; -use datafusion::error::DataFusionError; -use datafusion::sql::parser::{DFParser, Statement}; -use datafusion::sql::sqlparser::dialect::dialect_from_str; -use datafusion::sql::sqlparser::parser::ParserError; - -use rustyline::completion::{Completer, FilenameCompleter, Pair}; -use rustyline::error::ReadlineError; -use rustyline::highlight::{CmdKind, Highlighter}; -use rustyline::hint::Hinter; -use rustyline::validate::{ValidationContext, ValidationResult, Validator}; -use rustyline::{Context, Helper, Result}; - -pub struct CliHelper { - completer: FilenameCompleter, - dialect: String, - highlighter: Box, -} - -impl CliHelper { - pub fn new(dialect: &str, color: bool) -> Self { - let highlighter: Box = if !color { - Box::new(NoSyntaxHighlighter {}) - } else { - Box::new(SyntaxHighlighter::new(dialect)) - }; - Self { - completer: FilenameCompleter::new(), - dialect: dialect.into(), - highlighter, - } - } - - fn validate_input(&self, input: &str) -> Result { - if let Some(sql) = input.strip_suffix(';') { - let sql = match unescape_input(sql) { - Ok(sql) => sql, - Err(err) => { - return Ok(ValidationResult::Invalid(Some(format!( - " 🤔 Invalid statement: {err}", - )))) - } - }; - - let dialect = match dialect_from_str(&self.dialect) { - Some(dialect) => dialect, - None => { - return Ok(ValidationResult::Invalid(Some(format!( - " 🤔 Invalid dialect: {}", - self.dialect - )))) - } - }; - let lines = split_from_semicolon(sql); - for line in lines { - match DFParser::parse_sql_with_dialect(&line, dialect.as_ref()) { - Ok(statements) if statements.is_empty() => { - return Ok(ValidationResult::Invalid(Some( - " 🤔 You entered an empty statement".to_string(), - ))); - } - Ok(_statements) => {} - Err(err) => { - return Ok(ValidationResult::Invalid(Some(format!( - " 🤔 Invalid statement: {err}", - )))); - } - } - } - Ok(ValidationResult::Valid(None)) - } else if input.starts_with('\\') { - // command - Ok(ValidationResult::Valid(None)) - } else { - Ok(ValidationResult::Incomplete) - } - } -} - -impl Default for CliHelper { - fn default() -> Self { - Self::new("generic", false) - } -} - -impl Highlighter for CliHelper { - fn highlight<'l>(&self, line: &'l str, pos: usize) -> Cow<'l, str> { - self.highlighter.highlight(line, pos) - } - - fn highlight_char(&self, line: &str, pos: usize, kind: CmdKind) -> bool { - self.highlighter.highlight_char(line, pos, kind) - } -} - -impl Hinter for CliHelper { - type Hint = String; -} - -/// returns true if the current position is after the open quote for -/// creating an external table. -fn is_open_quote_for_location(line: &str, pos: usize) -> bool { - let mut sql = line[..pos].to_string(); - sql.push('\''); - if let Ok(stmts) = DFParser::parse_sql(&sql) { - if let Some(Statement::CreateExternalTable(_)) = stmts.back() { - return true; - } - } - false -} - -impl Completer for CliHelper { - type Candidate = Pair; - - fn complete( - &self, - line: &str, - pos: usize, - ctx: &Context<'_>, - ) -> std::result::Result<(usize, Vec), ReadlineError> { - if is_open_quote_for_location(line, pos) { - self.completer.complete(line, pos, ctx) - } else { - Ok((0, Vec::with_capacity(0))) - } - } -} - -impl Validator for CliHelper { - fn validate(&self, ctx: &mut ValidationContext<'_>) -> Result { - let input = ctx.input().trim_end(); - self.validate_input(input) - } -} - -impl Helper for CliHelper {} - -/// Unescape input string from readline. 
-/// -/// The data read from stdio will be escaped, so we need to unescape the input before executing the input -pub fn unescape_input(input: &str) -> datafusion::error::Result { - let mut chars = input.chars(); - - let mut result = String::with_capacity(input.len()); - while let Some(char) = chars.next() { - if char == '\\' { - if let Some(next_char) = chars.next() { - // https://static.rust-lang.org/doc/master/reference.html#literals - result.push(match next_char { - '0' => '\0', - 'n' => '\n', - 'r' => '\r', - 't' => '\t', - '\\' => '\\', - _ => { - return Err(sql_datafusion_err!(ParserError::TokenizerError(format!( - "unsupported escape char: '\\{}'", - next_char - )))) - } - }); - } - } else { - result.push(char); - } - } - - Ok(result) -} - -/// Splits a string which consists of multiple queries. -pub(crate) fn split_from_semicolon(sql: String) -> Vec { - let mut commands = Vec::new(); - let mut current_command = String::new(); - let mut in_single_quote = false; - let mut in_double_quote = false; - - for c in sql.chars() { - if c == '\'' && !in_double_quote { - in_single_quote = !in_single_quote; - } else if c == '"' && !in_single_quote { - in_double_quote = !in_double_quote; - } - - if c == ';' && !in_single_quote && !in_double_quote { - if !current_command.trim().is_empty() { - commands.push(format!("{};", current_command.trim())); - current_command.clear(); - } - } else { - current_command.push(c); - } - } - - if !current_command.trim().is_empty() { - commands.push(format!("{};", current_command.trim())); - } - - commands -} diff --git a/src/query/highlighter.rs b/src/query/highlighter.rs deleted file mode 100644 index f3e13ed5c..000000000 --- a/src/query/highlighter.rs +++ /dev/null @@ -1,127 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! The syntax highlighter. - -use std::{ - borrow::Cow::{self, Borrowed}, - fmt::Display, -}; - -use datafusion::sql::sqlparser::{ - dialect::{dialect_from_str, Dialect, GenericDialect}, - keywords::Keyword, - tokenizer::{Token, Tokenizer}, -}; -use rustyline::highlight::{CmdKind, Highlighter}; - -/// The syntax highlighter. -#[derive(Debug)] -pub struct SyntaxHighlighter { - dialect: Box, -} - -impl SyntaxHighlighter { - pub fn new(dialect: &str) -> Self { - let dialect = dialect_from_str(dialect).unwrap_or(Box::new(GenericDialect {})); - Self { dialect } - } -} - -pub struct NoSyntaxHighlighter {} - -impl Highlighter for NoSyntaxHighlighter {} - -impl Highlighter for SyntaxHighlighter { - fn highlight<'l>(&self, line: &'l str, _: usize) -> Cow<'l, str> { - let mut out_line = String::new(); - - // `with_unescape(false)` since we want to rebuild the original string. 
- let mut tokenizer = Tokenizer::new(self.dialect.as_ref(), line).with_unescape(false); - let tokens = tokenizer.tokenize(); - match tokens { - Ok(tokens) => { - for token in tokens.iter() { - match token { - Token::Word(w) if w.keyword != Keyword::NoKeyword => { - out_line.push_str(&Color::red(token)); - } - Token::SingleQuotedString(_) => { - out_line.push_str(&Color::green(token)); - } - other => out_line.push_str(&format!("{other}")), - } - } - out_line.into() - } - Err(_) => Borrowed(line), - } - } - - fn highlight_char(&self, line: &str, _pos: usize, _cmd: CmdKind) -> bool { - !line.is_empty() - } -} - -/// Convenient utility to return strings with [ANSI color](https://gist.github.com/JBlond/2fea43a3049b38287e5e9cefc87b2124). -struct Color {} - -impl Color { - fn green(s: impl Display) -> String { - format!("\x1b[92m{s}\x1b[0m") - } - - fn red(s: impl Display) -> String { - format!("\x1b[91m{s}\x1b[0m") - } -} - -#[cfg(test)] -mod tests { - use super::SyntaxHighlighter; - use rustyline::highlight::Highlighter; - - #[test] - fn highlighter_valid() { - let s = "SElect col_a from tab_1;"; - let highlighter = SyntaxHighlighter::new("generic"); - let out = highlighter.highlight(s, s.len()); - assert_eq!( - "\u{1b}[91mSElect\u{1b}[0m col_a \u{1b}[91mfrom\u{1b}[0m tab_1;", - out - ); - } - - #[test] - fn highlighter_valid_with_new_line() { - let s = "SElect col_a from tab_1\n WHERE col_b = 'なにか';"; - let highlighter = SyntaxHighlighter::new("generic"); - let out = highlighter.highlight(s, s.len()); - assert_eq!( - "\u{1b}[91mSElect\u{1b}[0m col_a \u{1b}[91mfrom\u{1b}[0m tab_1\n \u{1b}[91mWHERE\u{1b}[0m col_b = \u{1b}[92m'なにか'\u{1b}[0m;", - out - ); - } - - #[test] - fn highlighter_invalid() { - let s = "SElect col_a from tab_1 WHERE col_b = ';"; - let highlighter = SyntaxHighlighter::new("generic"); - let out = highlighter.highlight(s, s.len()); - assert_eq!("SElect col_a from tab_1 WHERE col_b = ';", out); - } -} diff --git a/src/query/mod.rs b/src/query/mod.rs index 46039425f..af637f5a8 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -20,8 +20,6 @@ pub mod catalog; pub mod exec; mod filter_optimizer; pub mod functions; -pub mod helper; -pub mod highlighter; mod listing_table_builder; pub mod object_storage; pub mod stream_schema_provider; @@ -649,8 +647,7 @@ pub async fn run_benchmark() -> Result<(), ExecuteError> { let parquet_file = env::var("PARQUET_LOCATION").unwrap(); //'/home/ubuntu/clickbench/hits.parquet' let base_command = - format!("CREATE EXTERNAL TABLE hits STORED AS PARQUET LOCATION '{parquet_file}' OPTIONS ('binary_as_string' 'true')", - parquet_file = parquet_file); + format!("CREATE EXTERNAL TABLE hits STORED AS PARQUET LOCATION '{parquet_file}'"); let mut commands = Vec::new(); let queries_file = env::var("QUERIES_FILE").unwrap(); //'/home/ubuntu/queries.sql' @@ -660,7 +657,6 @@ pub async fn run_benchmark() -> Result<(), ExecuteError> { commands.push(query.to_string()); } exec::exec_from_commands(&ctx, commands, false).await?; - Ok(()) } From b06149b4be402c5dd315f62cb29cc8012fe107c3 Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Wed, 26 Feb 2025 04:40:02 -0500 Subject: [PATCH 26/32] working --- src/handlers/http/query.rs | 5 ++++- src/query/mod.rs | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/handlers/http/query.rs b/src/handlers/http/query.rs index 87cabb0b5..e86699e5f 100644 --- a/src/handlers/http/query.rs +++ b/src/handlers/http/query.rs @@ -69,7 +69,10 @@ pub struct Query { } pub async fn query(req: HttpRequest, query_request: 
Query) -> Result { - let _ = run_benchmark().await; + tokio::task::spawn_blocking(|| { + run_benchmark(); + }); + let session_state = QUERY_SESSION.state(); let raw_logical_plan = match session_state .create_logical_plan(&query_request.query) diff --git a/src/query/mod.rs b/src/query/mod.rs index af637f5a8..932cbb422 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -624,7 +624,7 @@ pub fn flatten_objects_for_count(objects: Vec) -> Vec { objects } } - +#[tokio::main(flavor = "multi_thread", worker_threads = 16)] pub async fn run_benchmark() -> Result<(), ExecuteError> { let mut session_config = SessionConfig::from_env()?.with_information_schema(true); From 74676b9634ffa98c68b8571e573b75ee51eba791 Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Wed, 26 Feb 2025 06:29:26 -0500 Subject: [PATCH 27/32] register parquet --- src/handlers/http/query.rs | 2 +- src/query/mod.rs | 26 ++++++++++++++++++++------ 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/src/handlers/http/query.rs b/src/handlers/http/query.rs index e86699e5f..18b7e71c9 100644 --- a/src/handlers/http/query.rs +++ b/src/handlers/http/query.rs @@ -70,7 +70,7 @@ pub struct Query { pub async fn query(req: HttpRequest, query_request: Query) -> Result { tokio::task::spawn_blocking(|| { - run_benchmark(); + run_benchmark().unwrap(); }); let session_state = QUERY_SESSION.state(); diff --git a/src/query/mod.rs b/src/query/mod.rs index 932cbb422..0f3180e72 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -26,6 +26,7 @@ pub mod stream_schema_provider; pub mod cli_context; +use arrow_schema::DataType; use catalog::DynamicObjectStoreCatalog; use chrono::NaiveDateTime; use chrono::{DateTime, Duration, Utc}; @@ -643,15 +644,11 @@ pub async fn run_benchmark() -> Result<(), ExecuteError> { ))); // register `parquet_metadata` table function to get metadata from parquet files ctx.register_udtf("parquet_metadata", Arc::new(ParquetMetadataFunc {})); - let parquet_file = env::var("PARQUET_LOCATION").unwrap(); //'/home/ubuntu/clickbench/hits.parquet' - - let base_command = - format!("CREATE EXTERNAL TABLE hits STORED AS PARQUET LOCATION '{parquet_file}'"); - + register_hits(&ctx, &parquet_file).await?; + let mut commands = Vec::new(); let queries_file = env::var("QUERIES_FILE").unwrap(); //'/home/ubuntu/queries.sql' - exec::exec_from_commands(&ctx, vec![base_command], true).await?; let queries = fs::read_to_string(queries_file).unwrap(); for query in queries.lines() { commands.push(query.to_string()); @@ -660,6 +657,23 @@ pub async fn run_benchmark() -> Result<(), ExecuteError> { Ok(()) } +async fn register_hits(ctx: &SessionContext, parquet_file: &str) -> Result<()> { + let mut options: ParquetReadOptions<'_> = Default::default(); + options.table_partition_cols = vec![ + ("date".to_string(), DataType::Utf8), + ("hour".to_string(), DataType::Utf8), + ("minute".to_string(), DataType::Utf8), + ]; + ctx.register_parquet("hits", parquet_file, options) + .await + .map_err(|e| { + DataFusionError::Context( + format!("Registering 'hits' as {parquet_file}"), + Box::new(e), + ) + }) +} + pub mod error { use crate::{metadata::error::stream_info::MetadataError, storage::ObjectStorageError}; use datafusion::error::DataFusionError; From f77d1391889f2b87bc007254d61b62942d7ceac1 Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Wed, 26 Feb 2025 21:43:47 -0500 Subject: [PATCH 28/32] object store registered with session context --- src/handlers/http/query.rs | 2 +- src/query/mod.rs | 65 +++++++++++++++++++++++++------------- 2 files changed, 
44 insertions(+), 23 deletions(-) diff --git a/src/handlers/http/query.rs b/src/handlers/http/query.rs index 18b7e71c9..482d2ddee 100644 --- a/src/handlers/http/query.rs +++ b/src/handlers/http/query.rs @@ -70,7 +70,7 @@ pub struct Query { pub async fn query(req: HttpRequest, query_request: Query) -> Result { tokio::task::spawn_blocking(|| { - run_benchmark().unwrap(); + run_benchmark(CONFIG.storage()).unwrap(); }); let session_state = QUERY_SESSION.state(); diff --git a/src/query/mod.rs b/src/query/mod.rs index 0f3180e72..97a036b0e 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -26,7 +26,6 @@ pub mod stream_schema_provider; pub mod cli_context; -use arrow_schema::DataType; use catalog::DynamicObjectStoreCatalog; use chrono::NaiveDateTime; use chrono::{DateTime, Duration, Utc}; @@ -48,6 +47,7 @@ use once_cell::sync::Lazy; use relative_path::RelativePathBuf; use serde::{Deserialize, Serialize}; use serde_json::{json, Value}; +use std::collections::HashMap; use std::ops::Bound; use std::sync::Arc; use stream_schema_provider::collect_manifest_files; @@ -626,7 +626,7 @@ pub fn flatten_objects_for_count(objects: Vec) -> Vec { } } #[tokio::main(flavor = "multi_thread", worker_threads = 16)] -pub async fn run_benchmark() -> Result<(), ExecuteError> { +pub async fn run_benchmark(storage: Arc) -> Result<(), ExecuteError> { let mut session_config = SessionConfig::from_env()?.with_information_schema(true); session_config = session_config.with_batch_size(8192); @@ -634,18 +634,37 @@ pub async fn run_benchmark() -> Result<(), ExecuteError> { let rt_builder = RuntimeEnvBuilder::new(); // set memory pool size let runtime_env = rt_builder.build_arc()?; - + let state = SessionStateBuilder::new() + .with_default_features() + .with_config(session_config) + .with_runtime_env(runtime_env) + .build(); + let schema_provider = Arc::new(GlobalSchemaProvider { + storage: storage.get_object_store(), + }); + state + .catalog_list() + .catalog(&state.config_options().catalog.default_catalog) + .expect("default catalog is provided by datafusion") + .register_schema( + &state.config_options().catalog.default_schema, + schema_provider, + ) + .unwrap(); // enable dynamic file query - let ctx = SessionContext::new_with_config_rt(session_config, runtime_env).enable_url_table(); + let ctx = SessionContext::new_with_state(state).enable_url_table(); // install dynamic catalog provider that can register required object stores ctx.register_catalog_list(Arc::new(DynamicObjectStoreCatalog::new( ctx.state().catalog_list().clone(), ctx.state_weak_ref(), ))); + let mut table_options = HashMap::new(); + table_options.insert("binary_as_string", "true"); + // register `parquet_metadata` table function to get metadata from parquet files ctx.register_udtf("parquet_metadata", Arc::new(ParquetMetadataFunc {})); - let parquet_file = env::var("PARQUET_LOCATION").unwrap(); //'/home/ubuntu/clickbench/hits.parquet' - register_hits(&ctx, &parquet_file).await?; + // let parquet_file = env::var("PARQUET_LOCATION").unwrap(); //'/home/ubuntu/clickbench/hits.parquet' + // register_hits(&ctx, &parquet_file).await?; let mut commands = Vec::new(); let queries_file = env::var("QUERIES_FILE").unwrap(); //'/home/ubuntu/queries.sql' @@ -657,22 +676,24 @@ pub async fn run_benchmark() -> Result<(), ExecuteError> { Ok(()) } -async fn register_hits(ctx: &SessionContext, parquet_file: &str) -> Result<()> { - let mut options: ParquetReadOptions<'_> = Default::default(); - options.table_partition_cols = vec![ - ("date".to_string(), DataType::Utf8), - 
("hour".to_string(), DataType::Utf8), - ("minute".to_string(), DataType::Utf8), - ]; - ctx.register_parquet("hits", parquet_file, options) - .await - .map_err(|e| { - DataFusionError::Context( - format!("Registering 'hits' as {parquet_file}"), - Box::new(e), - ) - }) -} +// async fn register_hits(ctx: &SessionContext, parquet_file: &str) -> Result<()> { +// let mut options: ParquetReadOptions<'_> = Default::default(); +// options.table_partition_cols = vec![ +// ("date".to_string(), DataType::Utf8), +// ("hour".to_string(), DataType::Utf8), +// ("minute".to_string(), DataType::Utf8), +// ]; + +// ctx.register_parquet("hits", parquet_file, options) +// .await +// .map_err(|e| { +// DataFusionError::Context( +// format!("Registering 'hits' as {parquet_file}"), +// Box::new(e), +// ) +// }) + +// } pub mod error { use crate::{metadata::error::stream_info::MetadataError, storage::ObjectStorageError}; From 1f5897a6edbd0644bdac51c61acd4f74cd3c3e85 Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Thu, 27 Feb 2025 13:35:28 -0500 Subject: [PATCH 29/32] removed unused --- Cargo.lock | 300 ++++++++++--------- src/query/catalog.rs | 203 ------------- src/query/cli_context.rs | 158 ++++------ src/query/exec.rs | 385 ++----------------------- src/query/functions.rs | 288 ------------------- src/query/mod.rs | 60 ++-- src/query/object_storage.rs | 431 ---------------------------- src/query/stream_schema_provider.rs | 4 +- 8 files changed, 261 insertions(+), 1568 deletions(-) delete mode 100644 src/query/catalog.rs delete mode 100644 src/query/functions.rs delete mode 100644 src/query/object_storage.rs diff --git a/Cargo.lock b/Cargo.lock index 8501453c4..d49857e58 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -431,9 +431,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "54.1.0" +version = "54.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6422e12ac345a0678d7a17e316238e3a40547ae7f92052b77bd86d5e0239f3fc" +checksum = "dc208515aa0151028e464cc94a692156e945ce5126abd3537bb7fd6ba2143ed1" dependencies = [ "arrow-arith", "arrow-array", @@ -452,9 +452,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "54.1.0" +version = "54.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23cf34bb1f48c41d3475927bcc7be498665b8e80b379b88f62a840337f8b8248" +checksum = "e07e726e2b3f7816a85c6a45b6ec118eeeabf0b2a8c208122ad949437181f49a" dependencies = [ "arrow-array", "arrow-buffer", @@ -466,9 +466,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "54.1.0" +version = "54.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb4a06d507f54b70a277be22a127c8ffe0cec6cd98c0ad8a48e77779bbda8223" +checksum = "a2262eba4f16c78496adfd559a29fe4b24df6088efc9985a873d58e92be022d5" dependencies = [ "ahash", "arrow-buffer", @@ -483,9 +483,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "54.1.0" +version = "54.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d69d326d5ad1cb82dcefa9ede3fee8fdca98f9982756b16f9cb142f4aa6edc89" +checksum = "4e899dade2c3b7f5642eb8366cfd898958bcca099cde6dfea543c7e8d3ad88d4" dependencies = [ "bytes", "half", @@ -494,9 +494,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "54.1.0" +version = "54.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "626e65bd42636a84a238bed49d09c8777e3d825bf81f5087a70111c2831d9870" +checksum = 
"4103d88c5b441525ed4ac23153be7458494c2b0c9a11115848fdb9b81f6f886a" dependencies = [ "arrow-array", "arrow-buffer", @@ -515,9 +515,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "54.1.0" +version = "54.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71c8f959f7a1389b1dbd883cdcd37c3ed12475329c111912f7f69dad8195d8c6" +checksum = "43d3cb0914486a3cae19a5cad2598e44e225d53157926d0ada03c20521191a65" dependencies = [ "arrow-array", "arrow-cast", @@ -531,9 +531,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "54.1.0" +version = "54.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1858e7c7d01c44cf71c21a85534fd1a54501e8d60d1195d0d6fbcc00f4b10754" +checksum = "0a329fb064477c9ec5f0870d2f5130966f91055c7c5bce2b3a084f116bc28c3b" dependencies = [ "arrow-buffer", "arrow-schema", @@ -543,9 +543,9 @@ dependencies = [ [[package]] name = "arrow-flight" -version = "54.1.0" +version = "54.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9b3aaba47ed4b6146563c8b79ad0f7aa283f794cde0c057c656291b81196746" +checksum = "c7408f2bf3b978eddda272c7699f439760ebc4ac70feca25fefa82c5b8ce808d" dependencies = [ "arrow-arith", "arrow-array", @@ -570,9 +570,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "54.1.0" +version = "54.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6bb3f727f049884c7603f0364bc9315363f356b59e9f605ea76541847e06a1e" +checksum = "ddecdeab02491b1ce88885986e25002a3da34dd349f682c7cfe67bab7cc17b86" dependencies = [ "arrow-array", "arrow-buffer", @@ -585,9 +585,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "54.1.0" +version = "54.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35de94f165ed8830aede72c35f238763794f0d49c69d30c44d49c9834267ff8c" +checksum = "d03b9340013413eb84868682ace00a1098c81a5ebc96d279f7ebf9a4cac3c0fd" dependencies = [ "arrow-array", "arrow-buffer", @@ -605,9 +605,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "54.1.0" +version = "54.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8aa06e5f267dc53efbacb933485c79b6fc1685d3ffbe870a16ce4e696fb429da" +checksum = "f841bfcc1997ef6ac48ee0305c4dfceb1f7c786fe31e67c1186edf775e1f1160" dependencies = [ "arrow-array", "arrow-buffer", @@ -618,9 +618,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "54.1.0" +version = "54.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66f1144bb456a2f9d82677bd3abcea019217e572fc8f07de5a7bac4b2c56eb2c" +checksum = "1eeb55b0a0a83851aa01f2ca5ee5648f607e8506ba6802577afdda9d75cdedcd" dependencies = [ "arrow-array", "arrow-buffer", @@ -631,18 +631,18 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "54.1.0" +version = "54.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "105f01ec0090259e9a33a9263ec18ff223ab91a0ea9fbc18042f7e38005142f6" +checksum = "85934a9d0261e0fa5d4e2a5295107d743b543a6e0484a835d4b8db2da15306f9" dependencies = [ "serde", ] [[package]] name = "arrow-select" -version = "54.1.0" +version = "54.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f690752fdbd2dee278b5f1636fefad8f2f7134c85e20fd59c4199e15a39a6807" +checksum = "7e2932aece2d0c869dd2125feb9bd1709ef5c445daa3838ac4112dcfa0fda52c" dependencies = [ "ahash", "arrow-array", @@ -654,9 +654,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = 
"54.1.0" +version = "54.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0fff9cd745a7039b66c47ecaf5954460f9fa12eed628f65170117ea93e64ee0" +checksum = "912e38bd6a7a7714c1d9b61df80315685553b7455e8a6045c27531d8ecd5b458" dependencies = [ "arrow-array", "arrow-buffer", @@ -671,12 +671,12 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.18" +version = "0.4.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df895a515f70646414f4b45c0b79082783b80552b373a68283012928df56f522" +checksum = "06575e6a9673580f52661c92107baabffbf41e2141373441cbcdc47cb733003c" dependencies = [ "brotli 7.0.0", - "bzip2 0.4.4", + "bzip2", "flate2", "futures-core", "memchr", @@ -743,9 +743,9 @@ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" [[package]] name = "aws-config" -version = "1.5.16" +version = "1.5.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50236e4d60fe8458de90a71c0922c761e41755adf091b1b03de1cef537179915" +checksum = "490aa7465ee685b2ced076bb87ef654a47724a7844e2c7d3af4e749ce5b875dd" dependencies = [ "aws-credential-types", "aws-runtime", @@ -810,9 +810,9 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.58.0" +version = "1.60.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16ff718c9ee45cc1ebd4774a0e086bb80a6ab752b4902edf1c9f56b86ee1f770" +checksum = "60186fab60b24376d3e33b9ff0a43485f99efd470e3b75a9160c849741d63d56" dependencies = [ "aws-credential-types", "aws-runtime", @@ -832,9 +832,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.59.0" +version = "1.61.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5183e088715cc135d8d396fdd3bc02f018f0da4c511f53cb8d795b6a31c55809" +checksum = "7033130ce1ee13e6018905b7b976c915963755aef299c1521897679d6cd4f8ef" dependencies = [ "aws-credential-types", "aws-runtime", @@ -854,9 +854,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.59.0" +version = "1.61.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c9f944ef032717596639cea4a2118a3a457268ef51bbb5fde9637e54c465da00" +checksum = "c5c1cac7677179d622b4448b0d31bcb359185295dc6fca891920cfb17e2b5156" dependencies = [ "aws-credential-types", "aws-runtime", @@ -877,9 +877,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.2.8" +version = "1.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0bc5bbd1e4a2648fd8c5982af03935972c24a2f9846b396de661d351ee3ce837" +checksum = "9bfe75fad52793ce6dec0dc3d4b1f388f038b5eb866c8d4d7f3a8e21b5ea5051" dependencies = [ "aws-credential-types", "aws-smithy-http", @@ -1180,15 +1180,16 @@ dependencies = [ [[package]] name = "blake3" -version = "1.5.5" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8ee0c1824c4dea5b5f81736aff91bae041d2c07ee1192bec91054e10e3e601e" +checksum = "1230237285e3e10cde447185e8975408ae24deaa67205ce684805c25bc0c7937" dependencies = [ "arrayref", "arrayvec", "cc", "cfg-if", "constant_time_eq", + "memmap2", ] [[package]] @@ -1269,16 +1270,6 @@ dependencies = [ "bytes", ] -[[package]] -name = "bzip2" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" -dependencies = [ - "bzip2-sys", - "libc", -] - [[package]] name = "bzip2" version = "0.5.1" @@ -1421,9 +1412,9 @@ dependencies = 
[ [[package]] name = "clap" -version = "4.5.29" +version = "4.5.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8acebd8ad879283633b343856142139f2da2317c96b05b4dd6181c61e2480184" +checksum = "027bb0d98429ae334a8698531da7077bdf906419543a35a55c2cb1b66437d767" dependencies = [ "clap_builder", "clap_derive", @@ -1431,9 +1422,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.29" +version = "4.5.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6ba32cbda51c7e1dfd49acc1457ba1a7dec5b64fe360e828acb13ca8dc9c2f9" +checksum = "5589e0cba072e0f3d23791efac0fd8627b49c829c196a492e88168e6a669d863" dependencies = [ "anstream", "anstyle", @@ -1669,9 +1660,9 @@ dependencies = [ [[package]] name = "csv-core" -version = "0.1.11" +version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" +checksum = "7d02f3b0da4c6504f86e9cd789d8dbafab48c2321be74e9987593de5a894d93d" dependencies = [ "memchr", ] @@ -1734,21 +1725,23 @@ checksum = "575f75dfd25738df5b91b8e43e14d44bda14637a58fae779fd2b064f8bf3e010" [[package]] name = "datafusion" version = "45.0.0" -source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" +source = "git+https://github.com/apache/datafusion?branch=main#1ae06a497e7c6b117c211c52b33445c2063b9921" dependencies = [ "arrow", "arrow-ipc", "arrow-schema", "async-trait", "bytes", - "bzip2 0.5.1", + "bzip2", "chrono", "datafusion-catalog", "datafusion-catalog-listing", "datafusion-common", "datafusion-common-runtime", + "datafusion-datasource", "datafusion-execution", "datafusion-expr", + "datafusion-expr-common", "datafusion-functions", "datafusion-functions-aggregate", "datafusion-functions-nested", @@ -1782,7 +1775,7 @@ dependencies = [ [[package]] name = "datafusion-catalog" version = "45.0.0" -source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" +source = "git+https://github.com/apache/datafusion?branch=main#1ae06a497e7c6b117c211c52b33445c2063b9921" dependencies = [ "arrow", "async-trait", @@ -1801,40 +1794,28 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" version = "45.0.0" -source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" +source = "git+https://github.com/apache/datafusion?branch=main#1ae06a497e7c6b117c211c52b33445c2063b9921" dependencies = [ "arrow", - "async-compression", "async-trait", - "bytes", - "bzip2 0.5.1", - "chrono", "datafusion-catalog", "datafusion-common", - "datafusion-common-runtime", + "datafusion-datasource", "datafusion-execution", "datafusion-expr", "datafusion-physical-expr", "datafusion-physical-expr-common", "datafusion-physical-plan", - "flate2", "futures", - "glob", - "itertools 0.14.0", "log", "object_store", - "rand 0.8.5", "tokio", - "tokio-util", - "url", - "xz2", - "zstd", ] [[package]] name = "datafusion-common" version = "45.0.0" -source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" +source = "git+https://github.com/apache/datafusion?branch=main#1ae06a497e7c6b117c211c52b33445c2063b9921" dependencies = [ "ahash", "arrow", @@ -1857,21 +1838,54 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" version = "45.0.0" -source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" +source = 
"git+https://github.com/apache/datafusion?branch=main#1ae06a497e7c6b117c211c52b33445c2063b9921" +dependencies = [ + "log", + "tokio", +] + +[[package]] +name = "datafusion-datasource" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion?branch=main#1ae06a497e7c6b117c211c52b33445c2063b9921" dependencies = [ + "arrow", + "async-compression", + "async-trait", + "bytes", + "bzip2", + "chrono", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "flate2", + "futures", + "glob", + "itertools 0.14.0", "log", + "object_store", + "rand 0.8.5", "tokio", + "tokio-util", + "url", + "xz2", + "zstd", ] [[package]] name = "datafusion-doc" version = "45.0.0" -source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" +source = "git+https://github.com/apache/datafusion?branch=main#1ae06a497e7c6b117c211c52b33445c2063b9921" [[package]] name = "datafusion-execution" version = "45.0.0" -source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" +source = "git+https://github.com/apache/datafusion?branch=main#1ae06a497e7c6b117c211c52b33445c2063b9921" dependencies = [ "arrow", "dashmap", @@ -1889,7 +1903,7 @@ dependencies = [ [[package]] name = "datafusion-expr" version = "45.0.0" -source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" +source = "git+https://github.com/apache/datafusion?branch=main#1ae06a497e7c6b117c211c52b33445c2063b9921" dependencies = [ "arrow", "chrono", @@ -1909,10 +1923,11 @@ dependencies = [ [[package]] name = "datafusion-expr-common" version = "45.0.0" -source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" +source = "git+https://github.com/apache/datafusion?branch=main#1ae06a497e7c6b117c211c52b33445c2063b9921" dependencies = [ "arrow", "datafusion-common", + "indexmap 2.7.1", "itertools 0.14.0", "paste", ] @@ -1920,7 +1935,7 @@ dependencies = [ [[package]] name = "datafusion-functions" version = "45.0.0" -source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" +source = "git+https://github.com/apache/datafusion?branch=main#1ae06a497e7c6b117c211c52b33445c2063b9921" dependencies = [ "arrow", "arrow-buffer", @@ -1948,7 +1963,7 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" version = "45.0.0" -source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" +source = "git+https://github.com/apache/datafusion?branch=main#1ae06a497e7c6b117c211c52b33445c2063b9921" dependencies = [ "ahash", "arrow", @@ -1968,7 +1983,7 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" version = "45.0.0" -source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" +source = "git+https://github.com/apache/datafusion?branch=main#1ae06a497e7c6b117c211c52b33445c2063b9921" dependencies = [ "ahash", "arrow", @@ -1980,7 +1995,7 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" version = "45.0.0" -source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" +source = "git+https://github.com/apache/datafusion?branch=main#1ae06a497e7c6b117c211c52b33445c2063b9921" dependencies = [ "arrow", "arrow-ord", 
@@ -2000,7 +2015,7 @@ dependencies = [ [[package]] name = "datafusion-functions-table" version = "45.0.0" -source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" +source = "git+https://github.com/apache/datafusion?branch=main#1ae06a497e7c6b117c211c52b33445c2063b9921" dependencies = [ "arrow", "async-trait", @@ -2015,7 +2030,7 @@ dependencies = [ [[package]] name = "datafusion-functions-window" version = "45.0.0" -source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" +source = "git+https://github.com/apache/datafusion?branch=main#1ae06a497e7c6b117c211c52b33445c2063b9921" dependencies = [ "datafusion-common", "datafusion-doc", @@ -2031,7 +2046,7 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" version = "45.0.0" -source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" +source = "git+https://github.com/apache/datafusion?branch=main#1ae06a497e7c6b117c211c52b33445c2063b9921" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -2040,7 +2055,7 @@ dependencies = [ [[package]] name = "datafusion-macros" version = "45.0.0" -source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" +source = "git+https://github.com/apache/datafusion?branch=main#1ae06a497e7c6b117c211c52b33445c2063b9921" dependencies = [ "datafusion-expr", "quote", @@ -2050,7 +2065,7 @@ dependencies = [ [[package]] name = "datafusion-optimizer" version = "45.0.0" -source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" +source = "git+https://github.com/apache/datafusion?branch=main#1ae06a497e7c6b117c211c52b33445c2063b9921" dependencies = [ "arrow", "chrono", @@ -2068,7 +2083,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" version = "45.0.0" -source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" +source = "git+https://github.com/apache/datafusion?branch=main#1ae06a497e7c6b117c211c52b33445c2063b9921" dependencies = [ "ahash", "arrow", @@ -2089,7 +2104,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" version = "45.0.0" -source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" +source = "git+https://github.com/apache/datafusion?branch=main#1ae06a497e7c6b117c211c52b33445c2063b9921" dependencies = [ "ahash", "arrow", @@ -2102,7 +2117,7 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" version = "45.0.0" -source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" +source = "git+https://github.com/apache/datafusion?branch=main#1ae06a497e7c6b117c211c52b33445c2063b9921" dependencies = [ "arrow", "datafusion-common", @@ -2120,7 +2135,7 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" version = "45.0.0" -source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" +source = "git+https://github.com/apache/datafusion?branch=main#1ae06a497e7c6b117c211c52b33445c2063b9921" dependencies = [ "ahash", "arrow", @@ -2149,7 +2164,7 @@ dependencies = [ [[package]] name = "datafusion-sql" version = "45.0.0" -source = "git+https://github.com/apache/datafusion?branch=main#02cc22eebd7e422cbba6788971f2ffae103180e8" +source = 
"git+https://github.com/apache/datafusion?branch=main#1ae06a497e7c6b117c211c52b33445c2063b9921" dependencies = [ "arrow", "bigdecimal", @@ -2240,9 +2255,9 @@ dependencies = [ [[package]] name = "either" -version = "1.13.0" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" +checksum = "b7914353092ddf589ad78f25c5c1c21b7f80b0ff8621e7c814c3485b5306da9d" [[package]] name = "encoding_rs" @@ -2325,9 +2340,9 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.35" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c936bfdafb507ebbf50b8074c54fa31c5be9a1e7e5f467dd659697041407d07c" +checksum = "11faaf5a5236997af9848be0bef4db95824b1d534ebc64d0f0c6cf3e67bd38dc" dependencies = [ "crc32fast", "miniz_oxide", @@ -2532,9 +2547,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.4.7" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccae279728d634d083c00f6099cb58f01cc99c145b84b8be2f6c74618d79922e" +checksum = "5017294ff4bb30944501348f6f8e42e6ad28f42c8bbef7a74029aff064a4e3c2" dependencies = [ "atomic-waker", "bytes", @@ -2770,7 +2785,7 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "h2 0.4.7", + "h2 0.4.8", "http 1.2.0", "http-body 1.0.1", "httparse", @@ -3210,9 +3225,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.169" +version = "0.2.170" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" +checksum = "875b3680cb2f8f71bdcf9a30f38d48282f5d3c95cbf9b3fa57269bb5d5c06828" [[package]] name = "libm" @@ -3341,6 +3356,15 @@ version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" +[[package]] +name = "memmap2" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd3f7eed9d3848f8b98834af67102b720745c4ec028fcd0aa0239277e7de374f" +dependencies = [ + "libc", +] + [[package]] name = "mime" version = "0.3.17" @@ -3728,9 +3752,9 @@ dependencies = [ [[package]] name = "parquet" -version = "54.1.0" +version = "54.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a01a0efa30bbd601ae85b375c728efdb211ade54390281628a7b16708beb235" +checksum = "f88838dca3b84d41444a0341b19f347e8098a3898b0f21536654b8b799e11abd" dependencies = [ "ahash", "arrow-array", @@ -4148,9 +4172,9 @@ checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94" [[package]] name = "psm" -version = "0.1.24" +version = "0.1.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "200b9ff220857e53e184257720a14553b2f4aa02577d2ed9842d45d4b9654810" +checksum = "f58e5423e24c18cc840e1c98370b3993c6649cd1678b4d24318bcf0a083cbe88" dependencies = [ "cc", ] @@ -4221,9 +4245,9 @@ dependencies = [ [[package]] name = "quinn-udp" -version = "0.5.9" +version = "0.5.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c40286217b4ba3a71d644d752e6a0b71f13f1b6a2c5311acfcbe0c2418ed904" +checksum = "e46f3055866785f6b92bc6164b76be02ca8f2eb4b002c0354b28cf4c119e5944" dependencies = [ "cfg_aliases", "libc", @@ -4270,8 +4294,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3779b94aeb87e8bd4e834cee3650289ee9e0d5677f976ecdb6d219e5f4f6cd94" dependencies = [ 
"rand_chacha 0.9.0", - "rand_core 0.9.0", - "zerocopy 0.8.17", + "rand_core 0.9.2", + "zerocopy 0.8.20", ] [[package]] @@ -4291,7 +4315,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" dependencies = [ "ppv-lite86", - "rand_core 0.9.0", + "rand_core 0.9.2", ] [[package]] @@ -4305,12 +4329,12 @@ dependencies = [ [[package]] name = "rand_core" -version = "0.9.0" +version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b08f3c9802962f7e1b25113931d94f43ed9725bebc59db9d0c3e9a23b67e15ff" +checksum = "7a509b1a2ffbe92afab0e55c8fd99dea1c280e8171bd2d88682bb20bc41cbc2c" dependencies = [ "getrandom 0.3.1", - "zerocopy 0.8.17", + "zerocopy 0.8.20", ] [[package]] @@ -4393,9 +4417,9 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.5.8" +version = "0.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834" +checksum = "82b568323e98e49e2a0899dcee453dd679fae22d69adf9b11dd508d1549b7e2f" dependencies = [ "bitflags 2.8.0", ] @@ -4523,7 +4547,7 @@ dependencies = [ "bytes", "futures-core", "futures-util", - "h2 0.4.7", + "h2 0.4.8", "http 1.2.0", "http-body 1.0.1", "http-body-util", @@ -5115,9 +5139,9 @@ checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" [[package]] name = "stacker" -version = "0.1.18" +version = "0.1.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d08feb8f695b465baed819b03c128dc23f57a694510ab1f06c77f763975685e" +checksum = "d9156ebd5870ef293bfb43f91c7a74528d363ec0d424afe24160ed5a4343d08a" dependencies = [ "cc", "cfg-if", @@ -5253,9 +5277,9 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.16.0" +version = "3.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38c246215d7d24f48ae091a2902398798e05d978b24315d6efbc00ede9a8bb91" +checksum = "22e5a0acb1f3f55f65cc4a866c361b2fb2a0ff6366785ae6fbb5f85df07ba230" dependencies = [ "cfg-if", "fastrand 2.3.0", @@ -5523,7 +5547,7 @@ dependencies = [ "base64 0.22.1", "bytes", "flate2", - "h2 0.4.7", + "h2 0.4.8", "http 1.2.0", "http-body 1.0.1", "http-body-util", @@ -5722,9 +5746,9 @@ dependencies = [ [[package]] name = "typenum" -version = "1.17.0" +version = "1.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" +checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" [[package]] name = "ulid" @@ -5847,9 +5871,9 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.13.1" +version = "1.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ced87ca4be083373936a67f8de945faa23b6b42384bd5b64434850802c6dccd0" +checksum = "e0f540e3240398cce6128b64ba83fdbdd86129c16a3aa1a3a252efd66eb3d587" dependencies = [ "getrandom 0.3.1", "js-sys", @@ -6461,11 +6485,11 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.17" +version = "0.8.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa91407dacce3a68c56de03abe2760159582b846c6a4acd2f456618087f12713" +checksum = "dde3bb8c68a8f3f1ed4ac9221aad6b10cece3e60a8e2ea54a6a2dec806d0084c" dependencies = [ - "zerocopy-derive 0.8.17", + "zerocopy-derive 0.8.20", ] [[package]] @@ -6481,9 +6505,9 @@ dependencies = [ [[package]] name = 
"zerocopy-derive" -version = "0.8.17" +version = "0.8.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06718a168365cad3d5ff0bb133aad346959a2074bd4a85c121255a11304a8626" +checksum = "eea57037071898bf96a6da35fd626f4f27e9cee3ead2a6c703cf09d472b2e700" dependencies = [ "proc-macro2", "quote", @@ -6572,9 +6596,9 @@ dependencies = [ [[package]] name = "zstd" -version = "0.13.2" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" dependencies = [ "zstd-safe", ] diff --git a/src/query/catalog.rs b/src/query/catalog.rs deleted file mode 100644 index 494ef95ef..000000000 --- a/src/query/catalog.rs +++ /dev/null @@ -1,203 +0,0 @@ -use std::any::Any; -use std::sync::{Arc, Weak}; - -use datafusion::catalog::{CatalogProvider, CatalogProviderList, SchemaProvider}; - -use datafusion::common::plan_datafusion_err; -use datafusion::datasource::listing::ListingTableUrl; -use datafusion::datasource::TableProvider; -use datafusion::error::Result; -use datafusion::execution::context::SessionState; -use datafusion::execution::session_state::SessionStateBuilder; - -use async_trait::async_trait; -use dirs::home_dir; -use parking_lot::RwLock; - -use super::object_storage::{get_object_store, AwsOptions, GcpOptions}; - -/// Wraps another catalog, automatically register require object stores for the file locations -#[derive(Debug)] -pub struct DynamicObjectStoreCatalog { - inner: Arc, - state: Weak>, -} - -impl DynamicObjectStoreCatalog { - pub fn new(inner: Arc, state: Weak>) -> Self { - Self { inner, state } - } -} - -impl CatalogProviderList for DynamicObjectStoreCatalog { - fn as_any(&self) -> &dyn Any { - self - } - - fn register_catalog( - &self, - name: String, - catalog: Arc, - ) -> Option> { - self.inner.register_catalog(name, catalog) - } - - fn catalog_names(&self) -> Vec { - self.inner.catalog_names() - } - - fn catalog(&self, name: &str) -> Option> { - let state = self.state.clone(); - self.inner - .catalog(name) - .map(|catalog| Arc::new(DynamicObjectStoreCatalogProvider::new(catalog, state)) as _) - } -} - -/// Wraps another catalog provider -#[derive(Debug)] -struct DynamicObjectStoreCatalogProvider { - inner: Arc, - state: Weak>, -} - -impl DynamicObjectStoreCatalogProvider { - pub fn new(inner: Arc, state: Weak>) -> Self { - Self { inner, state } - } -} - -impl CatalogProvider for DynamicObjectStoreCatalogProvider { - fn as_any(&self) -> &dyn Any { - self - } - - fn schema_names(&self) -> Vec { - self.inner.schema_names() - } - - fn schema(&self, name: &str) -> Option> { - let state = self.state.clone(); - self.inner - .schema(name) - .map(|schema| Arc::new(DynamicObjectStoreSchemaProvider::new(schema, state)) as _) - } - - fn register_schema( - &self, - name: &str, - schema: Arc, - ) -> Result>> { - self.inner.register_schema(name, schema) - } -} - -/// Wraps another schema provider. [DynamicObjectStoreSchemaProvider] is responsible for registering the required -/// object stores for the file locations. 
-#[derive(Debug)] -struct DynamicObjectStoreSchemaProvider { - inner: Arc, - state: Weak>, -} - -impl DynamicObjectStoreSchemaProvider { - pub fn new(inner: Arc, state: Weak>) -> Self { - Self { inner, state } - } -} - -#[async_trait] -impl SchemaProvider for DynamicObjectStoreSchemaProvider { - fn as_any(&self) -> &dyn Any { - self - } - - fn table_names(&self) -> Vec { - self.inner.table_names() - } - - fn register_table( - &self, - name: String, - table: Arc, - ) -> Result>> { - self.inner.register_table(name, table) - } - - async fn table(&self, name: &str) -> Result>> { - let inner_table = self.inner.table(name).await; - if inner_table.is_ok() { - if let Some(inner_table) = inner_table? { - return Ok(Some(inner_table)); - } - } - - // if the inner schema provider didn't have a table by - // that name, try to treat it as a listing table - let mut state = self - .state - .upgrade() - .ok_or_else(|| plan_datafusion_err!("locking error"))? - .read() - .clone(); - let mut builder = SessionStateBuilder::from(state.clone()); - let optimized_name = substitute_tilde(name.to_owned()); - let table_url = ListingTableUrl::parse(optimized_name.as_str())?; - let scheme = table_url.scheme(); - let url = table_url.as_ref(); - - // If the store is already registered for this URL then `get_store` - // will return `Ok` which means we don't need to register it again. However, - // if `get_store` returns an `Err` then it means the corresponding store is - // not registered yet and we need to register it - match state.runtime_env().object_store_registry.get_store(url) { - Ok(_) => { /*Nothing to do here, store for this URL is already registered*/ } - Err(_) => { - // Register the store for this URL. Here we don't have access - // to any command options so the only choice is to use an empty collection - match scheme { - "s3" | "oss" | "cos" => { - if let Some(table_options) = builder.table_options() { - table_options.extensions.insert(AwsOptions::default()) - } - } - "gs" | "gcs" => { - if let Some(table_options) = builder.table_options() { - table_options.extensions.insert(GcpOptions::default()) - } - } - _ => {} - }; - state = builder.build(); - let store = get_object_store( - &state, - table_url.scheme(), - url, - &state.default_table_options(), - ) - .await?; - state.runtime_env().register_object_store(url, store); - } - } - self.inner.table(name).await - } - - fn deregister_table(&self, name: &str) -> Result>> { - self.inner.deregister_table(name) - } - - fn table_exist(&self, name: &str) -> bool { - self.inner.table_exist(name) - } -} - -pub fn substitute_tilde(cur: String) -> String { - if let Some(usr_dir_path) = home_dir() { - if let Some(usr_dir) = usr_dir_path.to_str() { - if cur.starts_with('~') && !usr_dir.is_empty() { - return cur.replacen('~', usr_dir, 1); - } - } - } - cur -} diff --git a/src/query/cli_context.rs b/src/query/cli_context.rs index 6b6c99b37..c55f31c3c 100644 --- a/src/query/cli_context.rs +++ b/src/query/cli_context.rs @@ -1,98 +1,60 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::sync::Arc; - -use datafusion::{ - dataframe::DataFrame, - error::DataFusionError, - execution::{context::SessionState, TaskContext}, - logical_expr::LogicalPlan, - prelude::SessionContext, -}; -use object_store::ObjectStore; - -use super::object_storage::{AwsOptions, GcpOptions}; - -#[async_trait::async_trait] -/// The CLI session context trait provides a way to have a session context that can be used with datafusion's CLI code. -pub trait CliSessionContext { - /// Get an atomic reference counted task context. - fn task_ctx(&self) -> Arc; - - /// Get the session state. - fn session_state(&self) -> SessionState; - - /// Register an object store with the session context. - fn register_object_store( - &self, - url: &url::Url, - object_store: Arc, - ) -> Option>; - - /// Register table options extension from scheme. - fn register_table_options_extension_from_scheme(&self, scheme: &str); - - /// Execute a logical plan and return a DataFrame. - async fn execute_logical_plan( - &self, - plan: LogicalPlan, - ) -> Result; -} - -#[async_trait::async_trait] -impl CliSessionContext for SessionContext { - fn task_ctx(&self) -> Arc { - self.task_ctx() - } - - fn session_state(&self) -> SessionState { - self.state() - } - - fn register_object_store( - &self, - url: &url::Url, - object_store: Arc, - ) -> Option> { - self.register_object_store(url, object_store) - } - - fn register_table_options_extension_from_scheme(&self, scheme: &str) { - match scheme { - // For Amazon S3 or Alibaba Cloud OSS - "s3" | "oss" | "cos" => { - // Register AWS specific table options in the session context: - self.register_table_options_extension(AwsOptions::default()) - } - // For Google Cloud Storage - "gs" | "gcs" => { - // Register GCP specific table options in the session context: - self.register_table_options_extension(GcpOptions::default()) - } - // For unsupported schemes, do nothing: - _ => {} - } - } - - async fn execute_logical_plan( - &self, - plan: LogicalPlan, - ) -> Result { - self.execute_logical_plan(plan).await - } -} +// // Licensed to the Apache Software Foundation (ASF) under one +// // or more contributor license agreements. See the NOTICE file +// // distributed with this work for additional information +// // regarding copyright ownership. The ASF licenses this file +// // to you under the Apache License, Version 2.0 (the +// // "License"); you may not use this file except in compliance +// // with the License. You may obtain a copy of the License at +// // +// // http://www.apache.org/licenses/LICENSE-2.0 +// // +// // Unless required by applicable law or agreed to in writing, +// // software distributed under the License is distributed on an +// // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// // KIND, either express or implied. See the License for the +// // specific language governing permissions and limitations +// // under the License. 
+ +// use std::sync::Arc; + +// use datafusion::{ +// dataframe::DataFrame, +// error::DataFusionError, +// execution::{context::SessionState, TaskContext}, +// logical_expr::LogicalPlan, +// prelude::SessionContext, +// }; + +// #[async_trait::async_trait] +// /// The CLI session context trait provides a way to have a session context that can be used with datafusion's CLI code. +// pub trait CliSessionContext { +// /// Get an atomic reference counted task context. +// fn task_ctx(&self) -> Arc; + +// /// Get the session state. +// fn session_state(&self) -> SessionState; + +// /// Execute a logical plan and return a DataFrame. +// async fn execute_logical_plan( +// &self, +// plan: LogicalPlan, +// ) -> Result; +// } + +// #[async_trait::async_trait] +// impl CliSessionContext for SessionContext { +// fn task_ctx(&self) -> Arc { +// self.task_ctx() +// } + +// fn session_state(&self) -> SessionState { +// self.state() +// } + +// async fn execute_logical_plan( +// &self, +// plan: LogicalPlan, +// ) -> Result { +// self.execute_logical_plan(plan).await +// } +// } diff --git a/src/query/exec.rs b/src/query/exec.rs index fe8e993e7..a9866b73a 100644 --- a/src/query/exec.rs +++ b/src/query/exec.rs @@ -17,25 +17,19 @@ //! Execution functions -use std::collections::HashMap; -use super::cli_context::CliSessionContext; -use super::object_storage::get_object_store; - +// use super::cli_context::CliSessionContext; +// use super::object_storage::get_object_store; use datafusion::common::instant::Instant; -use datafusion::common::{plan_datafusion_err, plan_err}; -use datafusion::config::ConfigFileType; -use datafusion::datasource::listing::ListingTableUrl; -use datafusion::error::{DataFusionError, Result}; -use datafusion::logical_expr::{DdlStatement, LogicalPlan}; -use datafusion::physical_plan::execution_plan::EmissionType; -use datafusion::physical_plan::{collect, execute_stream, ExecutionPlanProperties}; -use datafusion::sql::parser::{DFParser, Statement}; +use datafusion::common::plan_datafusion_err; +use datafusion::error::Result; +use datafusion::physical_plan::collect; +use datafusion::prelude::SessionContext; +use datafusion::sql::parser::DFParser; use datafusion::sql::sqlparser::dialect::dialect_from_str; - /// run and execute SQL statements and commands, against a context with the given print options pub async fn exec_from_commands( - ctx: &dyn CliSessionContext, + ctx: &SessionContext, commands: Vec, base_command: bool, ) -> Result<()> { @@ -66,10 +60,7 @@ pub async fn exec_from_commands( Ok(()) } -pub(super) async fn exec_and_print( - ctx: &dyn CliSessionContext, - sql: String, -) -> Result<()> { +pub(super) async fn exec_and_print(ctx: &SessionContext, sql: String) -> Result<()> { let task_ctx = ctx.task_ctx(); let dialect = &task_ctx.session_config().options().sql_parser.dialect; let dialect = dialect_from_str(dialect).ok_or_else(|| { @@ -81,356 +72,14 @@ pub(super) async fn exec_and_print( })?; let statements = DFParser::parse_sql_with_dialect(&sql, dialect.as_ref())?; - for statement in statements { - - let plan = create_plan(ctx, statement).await?; - - let df = ctx.execute_logical_plan(plan).await?; - let physical_plan = df.create_physical_plan().await?; - - if physical_plan.boundedness().is_unbounded() { - if physical_plan.pipeline_behavior() == EmissionType::Final { - return plan_err!( - "The given query can generate a valid result only once \ - the source finishes, but the source is unbounded" - ); - } - // As the input stream comes, we can generate results. 
- // However, memory safety is not guaranteed. - let _ = execute_stream(physical_plan, task_ctx.clone())?; - } else { - // Bounded stream; collected results are printed after all input consumed. - let _ = collect(physical_plan, task_ctx.clone()).await?; - } - } - - Ok(()) -} - - -fn config_file_type_from_str(ext: &str) -> Option { - match ext.to_lowercase().as_str() { - "csv" => Some(ConfigFileType::CSV), - "json" => Some(ConfigFileType::JSON), - "parquet" => Some(ConfigFileType::PARQUET), - _ => None, - } -} - -async fn create_plan( - ctx: &dyn CliSessionContext, - statement: Statement, -) -> Result { - let mut plan = ctx.session_state().statement_to_plan(statement).await?; - - // Note that cmd is a mutable reference so that create_external_table function can remove all - // datafusion-cli specific options before passing through to datafusion. Otherwise, datafusion - // will raise Configuration errors. - if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &plan { - // To support custom formats, treat error as None - let format = config_file_type_from_str(&cmd.file_type); - register_object_store_and_config_extensions(ctx, &cmd.location, &cmd.options, format) - .await?; - } - - if let LogicalPlan::Copy(copy_to) = &mut plan { - let format = config_file_type_from_str(©_to.file_type.get_ext()); - - register_object_store_and_config_extensions( - ctx, - ©_to.output_url, - ©_to.options, - format, - ) - .await?; - } - Ok(plan) -} - -/// Asynchronously registers an object store and its configuration extensions -/// to the session context. -/// -/// This function dynamically registers a cloud object store based on the given -/// location and options. It first parses the location to determine the scheme -/// and constructs the URL accordingly. Depending on the scheme, it also registers -/// relevant options. The function then alters the default table options with the -/// given custom options. Finally, it retrieves and registers the object store -/// in the session context. -/// -/// # Parameters -/// -/// * `ctx`: A reference to the `SessionContext` for registering the object store. -/// * `location`: A string reference representing the location of the object store. -/// * `options`: A reference to a hash map containing configuration options for -/// the object store. -/// -/// # Returns -/// -/// A `Result<()>` which is an Ok value indicating successful registration, or -/// an error upon failure. -/// -/// # Errors -/// -/// This function can return an error if the location parsing fails, options -/// alteration fails, or if the object store cannot be retrieved and registered -/// successfully. 
-pub(crate) async fn register_object_store_and_config_extensions( - ctx: &dyn CliSessionContext, - location: &String, - options: &HashMap, - format: Option, -) -> Result<()> { - // Parse the location URL to extract the scheme and other components - let table_path = ListingTableUrl::parse(location)?; + let statement = statements.front().unwrap(); + let plan = ctx.state().statement_to_plan(statement.clone()).await?; - // Extract the scheme (e.g., "s3", "gcs") from the parsed URL - let scheme = table_path.scheme(); - - // Obtain a reference to the URL - let url = table_path.as_ref(); - - // Register the options based on the scheme extracted from the location - ctx.register_table_options_extension_from_scheme(scheme); - - // Clone and modify the default table options based on the provided options - let mut table_options = ctx.session_state().default_table_options(); - if let Some(format) = format { - table_options.set_config_format(format); - } - table_options.alter_with_string_hash_map(options)?; - - // Retrieve the appropriate object store based on the scheme, URL, and modified table options - let store = get_object_store(&ctx.session_state(), scheme, url, &table_options).await?; - - // Register the retrieved object store in the session context's runtime environment - ctx.register_object_store(url, store); + let df = ctx.execute_logical_plan(plan).await?; + let physical_plan = df.create_physical_plan().await?; + // Bounded stream; collected results are printed after all input consumed. + let results = collect(physical_plan, task_ctx.clone()).await?; + println!("{:?}", results); Ok(()) -} - -#[cfg(test)] -mod tests { - use super::*; - - use datafusion::common::plan_err; - - use datafusion::prelude::SessionContext; - use url::Url; - - async fn create_external_table_test(location: &str, sql: &str) -> Result<()> { - let ctx = SessionContext::new(); - let plan = ctx.state().create_logical_plan(sql).await?; - - if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &plan { - let format = config_file_type_from_str(&cmd.file_type); - register_object_store_and_config_extensions(&ctx, &cmd.location, &cmd.options, format) - .await?; - } else { - return plan_err!("LogicalPlan is not a CreateExternalTable"); - } - - // Ensure the URL is supported by the object store - ctx.runtime_env() - .object_store(ListingTableUrl::parse(location)?)?; - - Ok(()) - } - - async fn copy_to_table_test(location: &str, sql: &str) -> Result<()> { - let ctx = SessionContext::new(); - // AWS CONFIG register. 
- - let plan = ctx.state().create_logical_plan(sql).await?; - - if let LogicalPlan::Copy(cmd) = &plan { - let format = config_file_type_from_str(&cmd.file_type.get_ext()); - register_object_store_and_config_extensions( - &ctx, - &cmd.output_url, - &cmd.options, - format, - ) - .await?; - } else { - return plan_err!("LogicalPlan is not a CreateExternalTable"); - } - - // Ensure the URL is supported by the object store - ctx.runtime_env() - .object_store(ListingTableUrl::parse(location)?)?; - - Ok(()) - } - - #[tokio::test] - async fn create_object_store_table_http() -> Result<()> { - // Should be OK - let location = "http://example.com/file.parquet"; - let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET LOCATION '{location}'"); - create_external_table_test(location, &sql).await?; - - Ok(()) - } - #[tokio::test] - async fn copy_to_external_object_store_test() -> Result<()> { - let locations = vec![ - "s3://bucket/path/file.parquet", - "oss://bucket/path/file.parquet", - "cos://bucket/path/file.parquet", - "gcs://bucket/path/file.parquet", - ]; - let ctx = SessionContext::new(); - let task_ctx = ctx.task_ctx(); - let dialect = &task_ctx.session_config().options().sql_parser.dialect; - let dialect = dialect_from_str(dialect).ok_or_else(|| { - plan_datafusion_err!( - "Unsupported SQL dialect: {dialect}. Available dialects: \ - Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, \ - MsSQL, ClickHouse, BigQuery, Ansi." - ) - })?; - for location in locations { - let sql = format!("copy (values (1,2)) to '{}' STORED AS PARQUET;", location); - let statements = DFParser::parse_sql_with_dialect(&sql, dialect.as_ref())?; - for statement in statements { - //Should not fail - let mut plan = create_plan(&ctx, statement).await?; - if let LogicalPlan::Copy(copy_to) = &mut plan { - assert_eq!(copy_to.output_url, location); - assert_eq!(copy_to.file_type.get_ext(), "parquet".to_string()); - ctx.runtime_env() - .object_store_registry - .get_store(&Url::parse(©_to.output_url).unwrap())?; - } else { - return plan_err!("LogicalPlan is not a CopyTo"); - } - } - } - Ok(()) - } - - #[tokio::test] - async fn copy_to_object_store_table_s3() -> Result<()> { - let access_key_id = "fake_access_key_id"; - let secret_access_key = "fake_secret_access_key"; - let location = "s3://bucket/path/file.parquet"; - - // Missing region, use object_store defaults - let sql = format!("COPY (values (1,2)) TO '{location}' STORED AS PARQUET - OPTIONS ('aws.access_key_id' '{access_key_id}', 'aws.secret_access_key' '{secret_access_key}')"); - copy_to_table_test(location, &sql).await?; - - Ok(()) - } - - #[tokio::test] - async fn create_object_store_table_s3() -> Result<()> { - let access_key_id = "fake_access_key_id"; - let secret_access_key = "fake_secret_access_key"; - let region = "fake_us-east-2"; - let session_token = "fake_session_token"; - let location = "s3://bucket/path/file.parquet"; - - // Missing region, use object_store defaults - let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET - OPTIONS('aws.access_key_id' '{access_key_id}', 'aws.secret_access_key' '{secret_access_key}') LOCATION '{location}'"); - create_external_table_test(location, &sql).await?; - - // Should be OK - let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET - OPTIONS('aws.access_key_id' '{access_key_id}', 'aws.secret_access_key' '{secret_access_key}', 'aws.region' '{region}', 'aws.session_token' '{session_token}') LOCATION '{location}'"); - create_external_table_test(location, &sql).await?; - - Ok(()) - } - - 
#[tokio::test] - async fn create_object_store_table_oss() -> Result<()> { - let access_key_id = "fake_access_key_id"; - let secret_access_key = "fake_secret_access_key"; - let endpoint = "fake_endpoint"; - let location = "oss://bucket/path/file.parquet"; - - // Should be OK - let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET - OPTIONS('aws.access_key_id' '{access_key_id}', 'aws.secret_access_key' '{secret_access_key}', 'aws.oss.endpoint' '{endpoint}') LOCATION '{location}'"); - create_external_table_test(location, &sql).await?; - - Ok(()) - } - - #[tokio::test] - async fn create_object_store_table_cos() -> Result<()> { - let access_key_id = "fake_access_key_id"; - let secret_access_key = "fake_secret_access_key"; - let endpoint = "fake_endpoint"; - let location = "cos://bucket/path/file.parquet"; - - // Should be OK - let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET - OPTIONS('aws.access_key_id' '{access_key_id}', 'aws.secret_access_key' '{secret_access_key}', 'aws.cos.endpoint' '{endpoint}') LOCATION '{location}'"); - create_external_table_test(location, &sql).await?; - - Ok(()) - } - - #[tokio::test] - async fn create_object_store_table_gcs() -> Result<()> { - let service_account_path = "fake_service_account_path"; - let service_account_key = - "{\"private_key\": \"fake_private_key.pem\",\"client_email\":\"fake_client_email\", \"private_key_id\":\"id\"}"; - let application_credentials_path = "fake_application_credentials_path"; - let location = "gcs://bucket/path/file.parquet"; - - // for service_account_path - let sql = format!( - "CREATE EXTERNAL TABLE test STORED AS PARQUET - OPTIONS('gcp.service_account_path' '{service_account_path}') LOCATION '{location}'" - ); - let err = create_external_table_test(location, &sql) - .await - .unwrap_err(); - assert!(err.to_string().contains("os error 2")); - - // for service_account_key - let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET OPTIONS('gcp.service_account_key' '{service_account_key}') LOCATION '{location}'"); - let err = create_external_table_test(location, &sql) - .await - .unwrap_err() - .to_string(); - assert!(err.contains("No RSA key found in pem file"), "{err}"); - - // for application_credentials_path - let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET - OPTIONS('gcp.application_credentials_path' '{application_credentials_path}') LOCATION '{location}'"); - let err = create_external_table_test(location, &sql) - .await - .unwrap_err(); - assert!(err.to_string().contains("os error 2")); - - Ok(()) - } - - #[tokio::test] - async fn create_external_table_local_file() -> Result<()> { - let location = "path/to/file.parquet"; - - // Ensure that local files are also registered - let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET LOCATION '{location}'"); - create_external_table_test(location, &sql).await.unwrap(); - - Ok(()) - } - - #[tokio::test] - async fn create_external_table_format_option() -> Result<()> { - let location = "path/to/file.cvs"; - - // Test with format options - let sql = - format!("CREATE EXTERNAL TABLE test STORED AS CSV LOCATION '{location}' OPTIONS('format.has_header' 'true')"); - create_external_table_test(location, &sql).await.unwrap(); - - Ok(()) - } -} +} \ No newline at end of file diff --git a/src/query/functions.rs b/src/query/functions.rs deleted file mode 100644 index f74675248..000000000 --- a/src/query/functions.rs +++ /dev/null @@ -1,288 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license 
agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Functions that are query-able and searchable via the `\h` command - -use std::fs::File; -use std::sync::Arc; - -use arrow::array::{Int64Array, StringArray}; -use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; -use arrow::record_batch::RecordBatch; -use datafusion::catalog::{Session, TableFunctionImpl}; -use datafusion::common::{plan_err, Column}; -use datafusion::datasource::TableProvider; -use datafusion::error::Result; -use datafusion::logical_expr::Expr; -use datafusion::physical_plan::memory::MemorySourceConfig; -use datafusion::physical_plan::ExecutionPlan; -use datafusion::scalar::ScalarValue; - -use async_trait::async_trait; -use parquet::basic::ConvertedType; -use parquet::data_type::{ByteArray, FixedLenByteArray}; -use parquet::file::reader::FileReader; -use parquet::file::serialized_reader::SerializedFileReader; -use parquet::file::statistics::Statistics; - - -/// PARQUET_META table function -#[derive(Debug)] -struct ParquetMetadataTable { - schema: SchemaRef, - batch: RecordBatch, -} - -#[async_trait] -impl TableProvider for ParquetMetadataTable { - fn as_any(&self) -> &dyn std::any::Any { - self - } - - fn schema(&self) -> arrow::datatypes::SchemaRef { - self.schema.clone() - } - - fn table_type(&self) -> datafusion::logical_expr::TableType { - datafusion::logical_expr::TableType::Base - } - - async fn scan( - &self, - _state: &dyn Session, - projection: Option<&Vec>, - _filters: &[Expr], - _limit: Option, - ) -> Result> { - Ok(MemorySourceConfig::try_new_exec( - &[vec![self.batch.clone()]], - TableProvider::schema(self), - projection.cloned(), - )?) 
- } -} - -fn convert_parquet_statistics( - value: &Statistics, - converted_type: ConvertedType, -) -> (Option, Option) { - match (value, converted_type) { - (Statistics::Boolean(val), _) => ( - val.min_opt().map(|v| v.to_string()), - val.max_opt().map(|v| v.to_string()), - ), - (Statistics::Int32(val), _) => ( - val.min_opt().map(|v| v.to_string()), - val.max_opt().map(|v| v.to_string()), - ), - (Statistics::Int64(val), _) => ( - val.min_opt().map(|v| v.to_string()), - val.max_opt().map(|v| v.to_string()), - ), - (Statistics::Int96(val), _) => ( - val.min_opt().map(|v| v.to_string()), - val.max_opt().map(|v| v.to_string()), - ), - (Statistics::Float(val), _) => ( - val.min_opt().map(|v| v.to_string()), - val.max_opt().map(|v| v.to_string()), - ), - (Statistics::Double(val), _) => ( - val.min_opt().map(|v| v.to_string()), - val.max_opt().map(|v| v.to_string()), - ), - (Statistics::ByteArray(val), ConvertedType::UTF8) => ( - byte_array_to_string(val.min_opt()), - byte_array_to_string(val.max_opt()), - ), - (Statistics::ByteArray(val), _) => ( - val.min_opt().map(|v| v.to_string()), - val.max_opt().map(|v| v.to_string()), - ), - (Statistics::FixedLenByteArray(val), ConvertedType::UTF8) => ( - fixed_len_byte_array_to_string(val.min_opt()), - fixed_len_byte_array_to_string(val.max_opt()), - ), - (Statistics::FixedLenByteArray(val), _) => ( - val.min_opt().map(|v| v.to_string()), - val.max_opt().map(|v| v.to_string()), - ), - } -} - -/// Convert to a string if it has utf8 encoding, otherwise print bytes directly -fn byte_array_to_string(val: Option<&ByteArray>) -> Option { - val.map(|v| { - v.as_utf8() - .map(|s| s.to_string()) - .unwrap_or_else(|_e| v.to_string()) - }) -} - -/// Convert to a string if it has utf8 encoding, otherwise print bytes directly -fn fixed_len_byte_array_to_string(val: Option<&FixedLenByteArray>) -> Option { - val.map(|v| { - v.as_utf8() - .map(|s| s.to_string()) - .unwrap_or_else(|_e| v.to_string()) - }) -} - -#[derive(Debug)] -pub struct ParquetMetadataFunc {} - -impl TableFunctionImpl for ParquetMetadataFunc { - fn call(&self, exprs: &[Expr]) -> Result> { - let filename = match exprs.first() { - Some(Expr::Literal(ScalarValue::Utf8(Some(s)))) => s, // single quote: parquet_metadata('x.parquet') - Some(Expr::Column(Column { name, .. 
})) => name, // double quote: parquet_metadata("x.parquet") - _ => { - return plan_err!( - "parquet_metadata requires string argument as its input" - ); - } - }; - - let file = File::open(filename.clone())?; - let reader = SerializedFileReader::new(file)?; - let metadata = reader.metadata(); - - let schema = Arc::new(Schema::new(vec![ - Field::new("filename", DataType::Utf8, true), - Field::new("row_group_id", DataType::Int64, true), - Field::new("row_group_num_rows", DataType::Int64, true), - Field::new("row_group_num_columns", DataType::Int64, true), - Field::new("row_group_bytes", DataType::Int64, true), - Field::new("column_id", DataType::Int64, true), - Field::new("file_offset", DataType::Int64, true), - Field::new("num_values", DataType::Int64, true), - Field::new("path_in_schema", DataType::Utf8, true), - Field::new("type", DataType::Utf8, true), - Field::new("stats_min", DataType::Utf8, true), - Field::new("stats_max", DataType::Utf8, true), - Field::new("stats_null_count", DataType::Int64, true), - Field::new("stats_distinct_count", DataType::Int64, true), - Field::new("stats_min_value", DataType::Utf8, true), - Field::new("stats_max_value", DataType::Utf8, true), - Field::new("compression", DataType::Utf8, true), - Field::new("encodings", DataType::Utf8, true), - Field::new("index_page_offset", DataType::Int64, true), - Field::new("dictionary_page_offset", DataType::Int64, true), - Field::new("data_page_offset", DataType::Int64, true), - Field::new("total_compressed_size", DataType::Int64, true), - Field::new("total_uncompressed_size", DataType::Int64, true), - ])); - - // construct record batch from metadata - let mut filename_arr = vec![]; - let mut row_group_id_arr = vec![]; - let mut row_group_num_rows_arr = vec![]; - let mut row_group_num_columns_arr = vec![]; - let mut row_group_bytes_arr = vec![]; - let mut column_id_arr = vec![]; - let mut file_offset_arr = vec![]; - let mut num_values_arr = vec![]; - let mut path_in_schema_arr = vec![]; - let mut type_arr = vec![]; - let mut stats_min_arr = vec![]; - let mut stats_max_arr = vec![]; - let mut stats_null_count_arr = vec![]; - let mut stats_distinct_count_arr = vec![]; - let mut stats_min_value_arr = vec![]; - let mut stats_max_value_arr = vec![]; - let mut compression_arr = vec![]; - let mut encodings_arr = vec![]; - let mut index_page_offset_arr = vec![]; - let mut dictionary_page_offset_arr = vec![]; - let mut data_page_offset_arr = vec![]; - let mut total_compressed_size_arr = vec![]; - let mut total_uncompressed_size_arr = vec![]; - for (rg_idx, row_group) in metadata.row_groups().iter().enumerate() { - for (col_idx, column) in row_group.columns().iter().enumerate() { - filename_arr.push(filename.clone()); - row_group_id_arr.push(rg_idx as i64); - row_group_num_rows_arr.push(row_group.num_rows()); - row_group_num_columns_arr.push(row_group.num_columns() as i64); - row_group_bytes_arr.push(row_group.total_byte_size()); - column_id_arr.push(col_idx as i64); - file_offset_arr.push(column.file_offset()); - num_values_arr.push(column.num_values()); - path_in_schema_arr.push(column.column_path().to_string()); - type_arr.push(column.column_type().to_string()); - let converted_type = column.column_descr().converted_type(); - - if let Some(s) = column.statistics() { - let (min_val, max_val) = - convert_parquet_statistics(s, converted_type); - stats_min_arr.push(min_val.clone()); - stats_max_arr.push(max_val.clone()); - stats_null_count_arr.push(s.null_count_opt().map(|c| c as i64)); - stats_distinct_count_arr - 
.push(s.distinct_count_opt().map(|c| c as i64)); - stats_min_value_arr.push(min_val); - stats_max_value_arr.push(max_val); - } else { - stats_min_arr.push(None); - stats_max_arr.push(None); - stats_null_count_arr.push(None); - stats_distinct_count_arr.push(None); - stats_min_value_arr.push(None); - stats_max_value_arr.push(None); - }; - compression_arr.push(format!("{:?}", column.compression())); - encodings_arr.push(format!("{:?}", column.encodings())); - index_page_offset_arr.push(column.index_page_offset()); - dictionary_page_offset_arr.push(column.dictionary_page_offset()); - data_page_offset_arr.push(column.data_page_offset()); - total_compressed_size_arr.push(column.compressed_size()); - total_uncompressed_size_arr.push(column.uncompressed_size()); - } - } - - let rb = RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(StringArray::from(filename_arr)), - Arc::new(Int64Array::from(row_group_id_arr)), - Arc::new(Int64Array::from(row_group_num_rows_arr)), - Arc::new(Int64Array::from(row_group_num_columns_arr)), - Arc::new(Int64Array::from(row_group_bytes_arr)), - Arc::new(Int64Array::from(column_id_arr)), - Arc::new(Int64Array::from(file_offset_arr)), - Arc::new(Int64Array::from(num_values_arr)), - Arc::new(StringArray::from(path_in_schema_arr)), - Arc::new(StringArray::from(type_arr)), - Arc::new(StringArray::from(stats_min_arr)), - Arc::new(StringArray::from(stats_max_arr)), - Arc::new(Int64Array::from(stats_null_count_arr)), - Arc::new(Int64Array::from(stats_distinct_count_arr)), - Arc::new(StringArray::from(stats_min_value_arr)), - Arc::new(StringArray::from(stats_max_value_arr)), - Arc::new(StringArray::from(compression_arr)), - Arc::new(StringArray::from(encodings_arr)), - Arc::new(Int64Array::from(index_page_offset_arr)), - Arc::new(Int64Array::from(dictionary_page_offset_arr)), - Arc::new(Int64Array::from(data_page_offset_arr)), - Arc::new(Int64Array::from(total_compressed_size_arr)), - Arc::new(Int64Array::from(total_uncompressed_size_arr)), - ], - )?; - - let parquet_metadata = ParquetMetadataTable { schema, batch: rb }; - Ok(Arc::new(parquet_metadata)) - } -} diff --git a/src/query/mod.rs b/src/query/mod.rs index 97a036b0e..1b66ce734 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -16,17 +16,13 @@ * */ -pub mod catalog; pub mod exec; mod filter_optimizer; -pub mod functions; mod listing_table_builder; -pub mod object_storage; pub mod stream_schema_provider; pub mod cli_context; -use catalog::DynamicObjectStoreCatalog; use chrono::NaiveDateTime; use chrono::{DateTime, Duration, Utc}; use datafusion::arrow::record_batch::RecordBatch; @@ -41,7 +37,6 @@ use datafusion::logical_expr::{ Aggregate, Explain, Filter, LogicalPlan, PlanType, Projection, ToStringifiedPlan, }; use datafusion::prelude::*; -use functions::ParquetMetadataFunc; use itertools::Itertools; use once_cell::sync::Lazy; use relative_path::RelativePathBuf; @@ -626,8 +621,8 @@ pub fn flatten_objects_for_count(objects: Vec) -> Vec { } } #[tokio::main(flavor = "multi_thread", worker_threads = 16)] -pub async fn run_benchmark(storage: Arc) -> Result<(), ExecuteError> { - let mut session_config = SessionConfig::from_env()?.with_information_schema(true); +pub async fn run_benchmark(_storage: Arc) -> Result<(), ExecuteError> { + let mut session_config = SessionConfig::new().with_information_schema(true); session_config = session_config.with_batch_size(8192); @@ -639,32 +634,22 @@ pub async fn run_benchmark(storage: Arc) -> Result<() .with_config(session_config) .with_runtime_env(runtime_env) .build(); - let 
schema_provider = Arc::new(GlobalSchemaProvider { - storage: storage.get_object_store(), - }); + // let schema_provider = Arc::new(GlobalSchemaProvider { + // storage: storage.get_object_store(), + // }); state .catalog_list() .catalog(&state.config_options().catalog.default_catalog) - .expect("default catalog is provided by datafusion") - .register_schema( - &state.config_options().catalog.default_schema, - schema_provider, - ) - .unwrap(); + .expect("default catalog is provided by datafusion"); // enable dynamic file query let ctx = SessionContext::new_with_state(state).enable_url_table(); - // install dynamic catalog provider that can register required object stores - ctx.register_catalog_list(Arc::new(DynamicObjectStoreCatalog::new( - ctx.state().catalog_list().clone(), - ctx.state_weak_ref(), - ))); let mut table_options = HashMap::new(); table_options.insert("binary_as_string", "true"); // register `parquet_metadata` table function to get metadata from parquet files - ctx.register_udtf("parquet_metadata", Arc::new(ParquetMetadataFunc {})); - // let parquet_file = env::var("PARQUET_LOCATION").unwrap(); //'/home/ubuntu/clickbench/hits.parquet' - // register_hits(&ctx, &parquet_file).await?; + //ctx.register_udtf("parquet_metadata", Arc::new(ParquetMetadataFunc {})); + let parquet_file = env::var("PARQUET_LOCATION").unwrap(); //'/home/ubuntu/clickbench/hits.parquet' + register_hits(&ctx, &parquet_file).await?; let mut commands = Vec::new(); let queries_file = env::var("QUERIES_FILE").unwrap(); //'/home/ubuntu/queries.sql' @@ -676,24 +661,19 @@ pub async fn run_benchmark(storage: Arc) -> Result<() Ok(()) } -// async fn register_hits(ctx: &SessionContext, parquet_file: &str) -> Result<()> { -// let mut options: ParquetReadOptions<'_> = Default::default(); -// options.table_partition_cols = vec![ -// ("date".to_string(), DataType::Utf8), -// ("hour".to_string(), DataType::Utf8), -// ("minute".to_string(), DataType::Utf8), -// ]; +async fn register_hits(ctx: &SessionContext, parquet_file: &str) -> Result<()> { + let options: ParquetReadOptions<'_> = Default::default(); -// ctx.register_parquet("hits", parquet_file, options) -// .await -// .map_err(|e| { -// DataFusionError::Context( -// format!("Registering 'hits' as {parquet_file}"), -// Box::new(e), -// ) -// }) + ctx.register_parquet("hits", parquet_file, options) + .await + .map_err(|e| { + DataFusionError::Context( + format!("Registering 'hits' as {parquet_file}"), + Box::new(e), + ) + }) -// } +} pub mod error { use crate::{metadata::error::stream_info::MetadataError, storage::ObjectStorageError}; diff --git a/src/query/object_storage.rs b/src/query/object_storage.rs deleted file mode 100644 index bdc6d1bee..000000000 --- a/src/query/object_storage.rs +++ /dev/null @@ -1,431 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::any::Any; -use std::fmt::{Debug, Display}; -use std::sync::Arc; - -use datafusion::common::config::{ - ConfigEntry, ConfigExtension, ConfigField, ExtensionOptions, TableOptions, Visit, -}; -use datafusion::common::{config_err, exec_datafusion_err, exec_err}; -use datafusion::error::{DataFusionError, Result}; -use datafusion::execution::context::SessionState; - -use async_trait::async_trait; -use aws_config::BehaviorVersion; -use aws_credential_types::provider::ProvideCredentials; -use object_store::aws::{AmazonS3Builder, AwsCredential}; -use object_store::gcp::GoogleCloudStorageBuilder; -use object_store::http::HttpBuilder; -use object_store::{ClientOptions, CredentialProvider, ObjectStore}; -use url::Url; - -pub async fn get_s3_object_store_builder( - url: &Url, - aws_options: &AwsOptions, -) -> Result { - let AwsOptions { - access_key_id, - secret_access_key, - session_token, - region, - endpoint, - allow_http, - } = aws_options; - - let bucket_name = get_bucket_name(url)?; - let mut builder = AmazonS3Builder::from_env().with_bucket_name(bucket_name); - - if let (Some(access_key_id), Some(secret_access_key)) = (access_key_id, secret_access_key) { - builder = builder - .with_access_key_id(access_key_id) - .with_secret_access_key(secret_access_key); - - if let Some(session_token) = session_token { - builder = builder.with_token(session_token); - } - } else { - let config = aws_config::defaults(BehaviorVersion::latest()).load().await; - if let Some(region) = config.region() { - builder = builder.with_region(region.to_string()); - } - - let credentials = config - .credentials_provider() - .ok_or_else(|| { - DataFusionError::ObjectStore(object_store::Error::Generic { - store: "S3", - source: "Failed to get S3 credentials from the environment".into(), - }) - })? - .clone(); - - let credentials = Arc::new(S3CredentialProvider { credentials }); - builder = builder.with_credentials(credentials); - } - - if let Some(region) = region { - builder = builder.with_region(region); - } - - if let Some(endpoint) = endpoint { - // Make a nicer error if the user hasn't allowed http and the endpoint - // is http as the default message is "URL scheme is not allowed" - if let Ok(endpoint_url) = Url::try_from(endpoint.as_str()) { - if !matches!(allow_http, Some(true)) && endpoint_url.scheme() == "http" { - return config_err!( - "Invalid endpoint: {endpoint}. \ - HTTP is not allowed for S3 endpoints. 
\ - To allow HTTP, set 'aws.allow_http' to true" - ); - } - } - - builder = builder.with_endpoint(endpoint); - } - - if let Some(allow_http) = allow_http { - builder = builder.with_allow_http(*allow_http); - } - - Ok(builder) -} - -#[derive(Debug)] -struct S3CredentialProvider { - credentials: aws_credential_types::provider::SharedCredentialsProvider, -} - -#[async_trait] -impl CredentialProvider for S3CredentialProvider { - type Credential = AwsCredential; - - async fn get_credential(&self) -> object_store::Result> { - let creds = self.credentials.provide_credentials().await.map_err(|e| { - object_store::Error::Generic { - store: "S3", - source: Box::new(e), - } - })?; - Ok(Arc::new(AwsCredential { - key_id: creds.access_key_id().to_string(), - secret_key: creds.secret_access_key().to_string(), - token: creds.session_token().map(ToString::to_string), - })) - } -} - -pub fn get_oss_object_store_builder( - url: &Url, - aws_options: &AwsOptions, -) -> Result { - get_object_store_builder(url, aws_options, true) -} - -pub fn get_cos_object_store_builder( - url: &Url, - aws_options: &AwsOptions, -) -> Result { - get_object_store_builder(url, aws_options, false) -} - -fn get_object_store_builder( - url: &Url, - aws_options: &AwsOptions, - virtual_hosted_style_request: bool, -) -> Result { - let bucket_name = get_bucket_name(url)?; - let mut builder = AmazonS3Builder::from_env() - .with_virtual_hosted_style_request(virtual_hosted_style_request) - .with_bucket_name(bucket_name) - // oss/cos don't care about the "region" field - .with_region("do_not_care"); - - if let (Some(access_key_id), Some(secret_access_key)) = - (&aws_options.access_key_id, &aws_options.secret_access_key) - { - builder = builder - .with_access_key_id(access_key_id) - .with_secret_access_key(secret_access_key); - } - - if let Some(endpoint) = &aws_options.endpoint { - builder = builder.with_endpoint(endpoint); - } - - Ok(builder) -} - -pub fn get_gcs_object_store_builder( - url: &Url, - gs_options: &GcpOptions, -) -> Result { - let bucket_name = get_bucket_name(url)?; - let mut builder = GoogleCloudStorageBuilder::from_env().with_bucket_name(bucket_name); - - if let Some(service_account_path) = &gs_options.service_account_path { - builder = builder.with_service_account_path(service_account_path); - } - - if let Some(service_account_key) = &gs_options.service_account_key { - builder = builder.with_service_account_key(service_account_key); - } - - if let Some(application_credentials_path) = &gs_options.application_credentials_path { - builder = builder.with_application_credentials(application_credentials_path); - } - - Ok(builder) -} - -fn get_bucket_name(url: &Url) -> Result<&str> { - url.host_str().ok_or_else(|| { - DataFusionError::Execution(format!( - "Not able to parse bucket name from url: {}", - url.as_str() - )) - }) -} - -/// This struct encapsulates AWS options one uses when setting up object storage. 
-#[derive(Default, Debug, Clone)] -pub struct AwsOptions { - /// Access Key ID - pub access_key_id: Option, - /// Secret Access Key - pub secret_access_key: Option, - /// Session token - pub session_token: Option, - /// AWS Region - pub region: Option, - /// OSS or COS Endpoint - pub endpoint: Option, - /// Allow HTTP (otherwise will always use https) - pub allow_http: Option, -} - -impl ExtensionOptions for AwsOptions { - fn as_any(&self) -> &dyn Any { - self - } - - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } - - fn cloned(&self) -> Box { - Box::new(self.clone()) - } - - fn set(&mut self, key: &str, value: &str) -> Result<()> { - let (_key, aws_key) = key.split_once('.').unwrap_or((key, "")); - let (key, rem) = aws_key.split_once('.').unwrap_or((aws_key, "")); - match key { - "access_key_id" => { - self.access_key_id.set(rem, value)?; - } - "secret_access_key" => { - self.secret_access_key.set(rem, value)?; - } - "session_token" => { - self.session_token.set(rem, value)?; - } - "region" => { - self.region.set(rem, value)?; - } - "oss" | "cos" | "endpoint" => { - self.endpoint.set(rem, value)?; - } - "allow_http" => { - self.allow_http.set(rem, value)?; - } - _ => { - return config_err!("Config value \"{}\" not found on AwsOptions", rem); - } - } - Ok(()) - } - - fn entries(&self) -> Vec { - struct Visitor(Vec); - - impl Visit for Visitor { - fn some(&mut self, key: &str, value: V, description: &'static str) { - self.0.push(ConfigEntry { - key: key.to_string(), - value: Some(value.to_string()), - description, - }) - } - - fn none(&mut self, key: &str, description: &'static str) { - self.0.push(ConfigEntry { - key: key.to_string(), - value: None, - description, - }) - } - } - - let mut v = Visitor(vec![]); - self.access_key_id.visit(&mut v, "access_key_id", ""); - self.secret_access_key - .visit(&mut v, "secret_access_key", ""); - self.session_token.visit(&mut v, "session_token", ""); - self.region.visit(&mut v, "region", ""); - self.endpoint.visit(&mut v, "endpoint", ""); - self.allow_http.visit(&mut v, "allow_http", ""); - v.0 - } -} - -impl ConfigExtension for AwsOptions { - const PREFIX: &'static str = "aws"; -} - -/// This struct encapsulates GCP options one uses when setting up object storage. 
-#[derive(Debug, Clone, Default)] -pub struct GcpOptions { - /// Service account path - pub service_account_path: Option, - /// Service account key - pub service_account_key: Option, - /// Application credentials path - pub application_credentials_path: Option, -} - -impl ExtensionOptions for GcpOptions { - fn as_any(&self) -> &dyn Any { - self - } - - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } - - fn cloned(&self) -> Box { - Box::new(self.clone()) - } - - fn set(&mut self, key: &str, value: &str) -> Result<()> { - let (_key, rem) = key.split_once('.').unwrap_or((key, "")); - match rem { - "service_account_path" => { - self.service_account_path.set(rem, value)?; - } - "service_account_key" => { - self.service_account_key.set(rem, value)?; - } - "application_credentials_path" => { - self.application_credentials_path.set(rem, value)?; - } - _ => { - return config_err!("Config value \"{}\" not found on GcpOptions", rem); - } - } - Ok(()) - } - - fn entries(&self) -> Vec { - struct Visitor(Vec); - - impl Visit for Visitor { - fn some(&mut self, key: &str, value: V, description: &'static str) { - self.0.push(ConfigEntry { - key: key.to_string(), - value: Some(value.to_string()), - description, - }) - } - - fn none(&mut self, key: &str, description: &'static str) { - self.0.push(ConfigEntry { - key: key.to_string(), - value: None, - description, - }) - } - } - - let mut v = Visitor(vec![]); - self.service_account_path - .visit(&mut v, "service_account_path", ""); - self.service_account_key - .visit(&mut v, "service_account_key", ""); - self.application_credentials_path - .visit(&mut v, "application_credentials_path", ""); - v.0 - } -} - -impl ConfigExtension for GcpOptions { - const PREFIX: &'static str = "gcp"; -} - -pub(crate) async fn get_object_store( - state: &SessionState, - scheme: &str, - url: &Url, - table_options: &TableOptions, -) -> Result, DataFusionError> { - let store: Arc = match scheme { - "s3" => { - let Some(options) = table_options.extensions.get::() else { - return exec_err!("Given table options incompatible with the 's3' scheme"); - }; - let builder = get_s3_object_store_builder(url, options).await?; - Arc::new(builder.build()?) - } - "oss" => { - let Some(options) = table_options.extensions.get::() else { - return exec_err!("Given table options incompatible with the 'oss' scheme"); - }; - let builder = get_oss_object_store_builder(url, options)?; - Arc::new(builder.build()?) - } - "cos" => { - let Some(options) = table_options.extensions.get::() else { - return exec_err!("Given table options incompatible with the 'cos' scheme"); - }; - let builder = get_cos_object_store_builder(url, options)?; - Arc::new(builder.build()?) - } - "gs" | "gcs" => { - let Some(options) = table_options.extensions.get::() else { - return exec_err!("Given table options incompatible with the 'gs'/'gcs' scheme"); - }; - let builder = get_gcs_object_store_builder(url, options)?; - Arc::new(builder.build()?) - } - "http" | "https" => Arc::new( - HttpBuilder::new() - .with_client_options(ClientOptions::new().with_allow_http(true)) - .with_url(url.origin().ascii_serialization()) - .build()?, - ), - _ => { - // For other types, try to get from `object_store_registry`: - state - .runtime_env() - .object_store_registry - .get_store(url) - .map_err(|_| exec_datafusion_err!("Unsupported object store scheme: {}", scheme))? 
- } - }; - Ok(store) -} diff --git a/src/query/stream_schema_provider.rs b/src/query/stream_schema_provider.rs index b03c7849e..e2d7b1016 100644 --- a/src/query/stream_schema_provider.rs +++ b/src/query/stream_schema_provider.rs @@ -31,7 +31,7 @@ use datafusion::catalog::Session; use datafusion::common::stats::Precision; use datafusion::common::Constraints; use datafusion::config::TableParquetOptions; -use datafusion::datasource::listing::file_compression_type::FileCompressionType; +use datafusion::datasource::file_format::file_compression_type::FileCompressionType; use datafusion::datasource::physical_plan::ParquetSource; use datafusion::logical_expr::utils::conjunction; use datafusion::physical_expr::LexOrdering; @@ -170,7 +170,7 @@ impl StandardTableProvider { constraints: Constraints::default(), file_compression_type: FileCompressionType::ZSTD, new_lines_in_values: false, - source: Arc::new(ParquetSource::new(TableParquetOptions::default())), + file_source: Arc::new(ParquetSource::new(TableParquetOptions::default())), }, filters.as_ref(), ) From 92e42bae62722ce6a03e6abc0d29e8ac973b2a72 Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Thu, 27 Feb 2025 14:12:16 -0500 Subject: [PATCH 30/32] removed unused --- src/query/cli_context.rs | 60 ---------------------------------------- src/query/mod.rs | 4 +-- 2 files changed, 1 insertion(+), 63 deletions(-) delete mode 100644 src/query/cli_context.rs diff --git a/src/query/cli_context.rs b/src/query/cli_context.rs deleted file mode 100644 index c55f31c3c..000000000 --- a/src/query/cli_context.rs +++ /dev/null @@ -1,60 +0,0 @@ -// // Licensed to the Apache Software Foundation (ASF) under one -// // or more contributor license agreements. See the NOTICE file -// // distributed with this work for additional information -// // regarding copyright ownership. The ASF licenses this file -// // to you under the Apache License, Version 2.0 (the -// // "License"); you may not use this file except in compliance -// // with the License. You may obtain a copy of the License at -// // -// // http://www.apache.org/licenses/LICENSE-2.0 -// // -// // Unless required by applicable law or agreed to in writing, -// // software distributed under the License is distributed on an -// // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// // KIND, either express or implied. See the License for the -// // specific language governing permissions and limitations -// // under the License. - -// use std::sync::Arc; - -// use datafusion::{ -// dataframe::DataFrame, -// error::DataFusionError, -// execution::{context::SessionState, TaskContext}, -// logical_expr::LogicalPlan, -// prelude::SessionContext, -// }; - -// #[async_trait::async_trait] -// /// The CLI session context trait provides a way to have a session context that can be used with datafusion's CLI code. -// pub trait CliSessionContext { -// /// Get an atomic reference counted task context. -// fn task_ctx(&self) -> Arc; - -// /// Get the session state. -// fn session_state(&self) -> SessionState; - -// /// Execute a logical plan and return a DataFrame. 
-// async fn execute_logical_plan( -// &self, -// plan: LogicalPlan, -// ) -> Result; -// } - -// #[async_trait::async_trait] -// impl CliSessionContext for SessionContext { -// fn task_ctx(&self) -> Arc { -// self.task_ctx() -// } - -// fn session_state(&self) -> SessionState { -// self.state() -// } - -// async fn execute_logical_plan( -// &self, -// plan: LogicalPlan, -// ) -> Result { -// self.execute_logical_plan(plan).await -// } -// } diff --git a/src/query/mod.rs b/src/query/mod.rs index 1b66ce734..1c5f53e97 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -21,8 +21,6 @@ mod filter_optimizer; mod listing_table_builder; pub mod stream_schema_provider; -pub mod cli_context; - use chrono::NaiveDateTime; use chrono::{DateTime, Duration, Utc}; use datafusion::arrow::record_batch::RecordBatch; @@ -642,7 +640,7 @@ pub async fn run_benchmark(_storage: Arc) -> Result<( .catalog(&state.config_options().catalog.default_catalog) .expect("default catalog is provided by datafusion"); // enable dynamic file query - let ctx = SessionContext::new_with_state(state).enable_url_table(); + let ctx = SessionContext::new_with_state(state); let mut table_options = HashMap::new(); table_options.insert("binary_as_string", "true"); From 4d2b7ecc59801878d43ef773e6e8341d578c2ad9 Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Sat, 1 Mar 2025 22:07:41 -0500 Subject: [PATCH 31/32] multi thread in parseable query --- src/handlers/http/query.rs | 15 +++++++-------- src/query/mod.rs | 29 +++++++++++++++++------------ 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/src/handlers/http/query.rs b/src/handlers/http/query.rs index 482d2ddee..e0efcdccb 100644 --- a/src/handlers/http/query.rs +++ b/src/handlers/http/query.rs @@ -69,9 +69,9 @@ pub struct Query { } pub async fn query(req: HttpRequest, query_request: Query) -> Result { - tokio::task::spawn_blocking(|| { - run_benchmark(CONFIG.storage()).unwrap(); - }); + // tokio::task::spawn_blocking(|| { + // run_benchmark(CONFIG.storage()).unwrap(); + // }); let session_state = QUERY_SESSION.state(); let raw_logical_plan = match session_state @@ -141,7 +141,10 @@ pub async fn query(req: HttpRequest, query_request: Query) -> Result Result) -> SessionContext { let runtime_config = storage - .get_datafusion_runtime() - .with_disk_manager(DiskManagerConfig::NewOs); + .get_datafusion_runtime(); let (pool_size, fraction) = match CONFIG.options.query_memory_pool_size { Some(size) => (size, 1.), @@ -140,6 +138,7 @@ impl Query { SessionContext::new_with_state(state) } + #[tokio::main(flavor = "multi_thread")] pub async fn execute( &self, stream_name: String, @@ -619,26 +618,32 @@ pub fn flatten_objects_for_count(objects: Vec) -> Vec { } } #[tokio::main(flavor = "multi_thread", worker_threads = 16)] -pub async fn run_benchmark(_storage: Arc) -> Result<(), ExecuteError> { +pub async fn run_benchmark(storage: Arc) -> Result<(), ExecuteError> { let mut session_config = SessionConfig::new().with_information_schema(true); session_config = session_config.with_batch_size(8192); - - let rt_builder = RuntimeEnvBuilder::new(); + let runtime_config = storage + .get_datafusion_runtime() + .with_disk_manager(DiskManagerConfig::NewOs); // set memory pool size - let runtime_env = rt_builder.build_arc()?; + let runtime_env = runtime_config.build_arc()?; let state = SessionStateBuilder::new() .with_default_features() .with_config(session_config) .with_runtime_env(runtime_env) .build(); - // let schema_provider = Arc::new(GlobalSchemaProvider { - // storage: 
storage.get_object_store(), - // }); + let schema_provider = Arc::new(GlobalSchemaProvider { + storage: storage.get_object_store(), + }); state .catalog_list() .catalog(&state.config_options().catalog.default_catalog) - .expect("default catalog is provided by datafusion"); + .expect("default catalog is provided by datafusion") + .register_schema( + &state.config_options().catalog.default_schema, + schema_provider, + ) + .unwrap(); // enable dynamic file query let ctx = SessionContext::new_with_state(state); let mut table_options = HashMap::new(); From d5373077f9b5c4c120001c203cedd11ae3e7a2ae Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Sun, 2 Mar 2025 12:51:51 -0500 Subject: [PATCH 32/32] drop cache --- src/handlers/http/query.rs | 18 +++++++- src/query/exec.rs | 85 -------------------------------------- src/query/mod.rs | 81 ++++-------------------------------- 3 files changed, 25 insertions(+), 159 deletions(-) delete mode 100644 src/query/exec.rs diff --git a/src/handlers/http/query.rs b/src/handlers/http/query.rs index e0efcdccb..2e80111be 100644 --- a/src/handlers/http/query.rs +++ b/src/handlers/http/query.rs @@ -29,6 +29,7 @@ use serde::{Deserialize, Serialize}; use serde_json::{json, Value}; use std::collections::HashMap; use std::pin::Pin; +use std::process::Command; use std::sync::Arc; use std::time::Instant; use tracing::error; @@ -41,7 +42,7 @@ use crate::event::commit_schema; use crate::metrics::QUERY_EXECUTE_TIME; use crate::option::{Mode, CONFIG}; use crate::query::error::ExecuteError; -use crate::query::{run_benchmark, CountsRequest, CountsResponse, Query as LogicalQuery}; +use crate::query::{CountsRequest, CountsResponse, Query as LogicalQuery}; use crate::query::{TableScanVisitor, QUERY_SESSION}; use crate::rbac::Users; use crate::response::QueryResponse; @@ -157,7 +158,7 @@ pub async fn query(req: HttpRequest, query_request: Query) -> Result Option { Some(q) } +pub fn drop_system_caches() -> Result<(), QueryError> { + // Sync to flush file system buffers + Command::new("sync") + .status() + .expect("Failed to execute sync command"); + let _ = Command::new("sudo") + .args(["sh", "-c", "echo 3 > /proc/sys/vm/drop_caches"]) + .output() + .map_err(|e| QueryError::Anyhow(anyhow::Error::msg(e.to_string())))?; + + Ok(()) +} + #[derive(Debug, thiserror::Error)] pub enum QueryError { #[error("Query cannot be empty")] diff --git a/src/query/exec.rs b/src/query/exec.rs deleted file mode 100644 index a9866b73a..000000000 --- a/src/query/exec.rs +++ /dev/null @@ -1,85 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! 
Execution functions - -// use super::cli_context::CliSessionContext; -// use super::object_storage::get_object_store; -use datafusion::common::instant::Instant; -use datafusion::common::plan_datafusion_err; -use datafusion::error::Result; -use datafusion::physical_plan::collect; -use datafusion::prelude::SessionContext; -use datafusion::sql::parser::DFParser; -use datafusion::sql::sqlparser::dialect::dialect_from_str; - -/// run and execute SQL statements and commands, against a context with the given print options -pub async fn exec_from_commands( - ctx: &SessionContext, - commands: Vec, - base_command: bool, -) -> Result<()> { - if !base_command { - const TRIES: usize = 3; - let mut query_num = 1; - let mut total_elapsed_per_iteration = vec![0.0; TRIES]; - for sql in commands.clone() { - for iteration in 1..=TRIES { - let start = Instant::now(); - exec_and_print(ctx, sql.clone()).await?; - let elapsed = start.elapsed().as_secs_f64(); - total_elapsed_per_iteration[iteration - 1] += elapsed; - println!("Query {query_num} iteration {iteration} took {elapsed} seconds"); - } - query_num += 1; - } - for (iteration, total_elapsed) in total_elapsed_per_iteration.iter().enumerate() { - println!( - "Total time for iteration {}: {} seconds", - iteration + 1, - total_elapsed - ); - } - } - exec_and_print(ctx, commands[0].clone()).await?; - - Ok(()) -} - -pub(super) async fn exec_and_print(ctx: &SessionContext, sql: String) -> Result<()> { - let task_ctx = ctx.task_ctx(); - let dialect = &task_ctx.session_config().options().sql_parser.dialect; - let dialect = dialect_from_str(dialect).ok_or_else(|| { - plan_datafusion_err!( - "Unsupported SQL dialect: {dialect}. Available dialects: \ - Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, \ - MsSQL, ClickHouse, BigQuery, Ansi." - ) - })?; - - let statements = DFParser::parse_sql_with_dialect(&sql, dialect.as_ref())?; - let statement = statements.front().unwrap(); - let plan = ctx.state().statement_to_plan(statement.clone()).await?; - - let df = ctx.execute_logical_plan(plan).await?; - let physical_plan = df.create_physical_plan().await?; - - // Bounded stream; collected results are printed after all input consumed. 
- let results = collect(physical_plan, task_ctx.clone()).await?;
- println!("{:?}", results);
- Ok(())
-}
\ No newline at end of file
diff --git a/src/query/mod.rs b/src/query/mod.rs
index becfc5c97..df00ba244 100644
--- a/src/query/mod.rs
+++ b/src/query/mod.rs
@@ -16,7 +16,6 @@
 *
 */
-pub mod exec;
 mod filter_optimizer;
 mod listing_table_builder;
 pub mod stream_schema_provider;
@@ -27,7 +26,6 @@
 use datafusion::arrow::record_batch::RecordBatch;
 use datafusion::common::tree_node::{Transformed, TreeNode, TreeNodeRecursion, TreeNodeVisitor};
 use datafusion::error::{DataFusionError, Result};
-use datafusion::execution::disk_manager::DiskManagerConfig;
 use datafusion::execution::SessionStateBuilder;
 use datafusion::logical_expr::expr::Alias;
 use datafusion::logical_expr::{
@@ -39,13 +37,10 @@
 use once_cell::sync::Lazy;
 use relative_path::RelativePathBuf;
 use serde::{Deserialize, Serialize};
 use serde_json::{json, Value};
-use std::collections::HashMap;
 use std::ops::Bound;
 use std::sync::Arc;
 use stream_schema_provider::collect_manifest_files;
-use sysinfo::{MemoryRefreshKind, System};
-
-use std::{env, fs};
+use sysinfo::System;
 use self::error::ExecuteError;
 use self::stream_schema_provider::GlobalSchemaProvider;
@@ -55,7 +50,7 @@ use crate::catalog::manifest::Manifest;
 use crate::catalog::snapshot::Snapshot;
 use crate::catalog::Snapshot as CatalogSnapshot;
 use crate::event;
-use crate::handlers::http::query::QueryError;
+use crate::handlers::http::query::{drop_system_caches, QueryError};
 use crate::metadata::STREAM_INFO;
 use crate::option::{Mode, CONFIG};
 use crate::storage::{ObjectStorageProvider, ObjectStoreFormat, STREAM_ROOT_DIRECTORY};
@@ -74,6 +69,8 @@ pub struct Query {
 impl Query {
 // create session context for this query
 pub fn create_session_context(storage: Arc) -> SessionContext {
+ drop_system_caches().unwrap();
+
 let runtime_config = storage
 .get_datafusion_runtime();
@@ -109,11 +106,11 @@ impl Query {
 // Enable StringViewArray
 // https://www.influxdata.com/blog/faster-queries-with-stringview-part-one-influxdb/
- // config
- // .options_mut()
- // .execution
- // .parquet
- // .schema_force_view_types = true;
+ config
+ .options_mut()
+ .execution
+ .parquet
+ .schema_force_view_types = true;
 config.options_mut().execution.parquet.binary_as_string = true;
 let state = SessionStateBuilder::new()
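The uncommented block above opts the Parquet reader into Arrow's view types (Utf8View/BinaryView, the StringView layout from the linked InfluxDB post), and binary_as_string additionally reads Parquet binary columns as UTF-8 strings. A standalone sketch with the same two toggles, in case it helps to reproduce the behaviour outside create_session_context:

    use datafusion::prelude::{SessionConfig, SessionContext};

    fn view_types_ctx() -> SessionContext {
        let mut config = SessionConfig::new();
        // Read Utf8/Binary columns back as Utf8View/BinaryView arrays.
        config.options_mut().execution.parquet.schema_force_view_types = true;
        // Read binary columns as UTF-8 strings instead of raw bytes.
        config.options_mut().execution.parquet.binary_as_string = true;
        SessionContext::new_with_config(config)
    }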
"true"); - - // register `parquet_metadata` table function to get metadata from parquet files - //ctx.register_udtf("parquet_metadata", Arc::new(ParquetMetadataFunc {})); - let parquet_file = env::var("PARQUET_LOCATION").unwrap(); //'/home/ubuntu/clickbench/hits.parquet' - register_hits(&ctx, &parquet_file).await?; - - let mut commands = Vec::new(); - let queries_file = env::var("QUERIES_FILE").unwrap(); //'/home/ubuntu/queries.sql' - let queries = fs::read_to_string(queries_file).unwrap(); - for query in queries.lines() { - commands.push(query.to_string()); - } - exec::exec_from_commands(&ctx, commands, false).await?; - Ok(()) -} - -async fn register_hits(ctx: &SessionContext, parquet_file: &str) -> Result<()> { - let options: ParquetReadOptions<'_> = Default::default(); - - ctx.register_parquet("hits", parquet_file, options) - .await - .map_err(|e| { - DataFusionError::Context( - format!("Registering 'hits' as {parquet_file}"), - Box::new(e), - ) - }) - -} pub mod error { use crate::{metadata::error::stream_info::MetadataError, storage::ObjectStorageError};